[llvm] 4f80f36 - AMDGPU: Add new metadata and expand atomicrmw fadd expansion tests
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 25 14:43:11 PDT 2024
Author: Matt Arsenault
Date: 2024-06-25T23:42:48+02:00
New Revision: 4f80f362a5b2b0339bf702f6ff7ae14304c76185
URL: https://github.com/llvm/llvm-project/commit/4f80f362a5b2b0339bf702f6ff7ae14304c76185
DIFF: https://github.com/llvm/llvm-project/commit/4f80f362a5b2b0339bf702f6ff7ae14304c76185.diff
LOG: AMDGPU: Add new metadata and expand atomicrmw fadd expansion tests
Added:
Modified:
llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index f8f85a56a9b2f..00a01e8d976bb 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -13,8 +13,8 @@
; float
; --------------------------------------------------------------------
-define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -28,7 +28,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s4
@@ -38,7 +38,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s4
@@ -49,7 +49,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
@@ -78,7 +78,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s8
@@ -87,7 +87,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
@@ -114,7 +114,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -141,7 +141,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -168,7 +168,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -197,12 +197,12 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -216,7 +216,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s4
@@ -226,7 +226,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v1, s4
@@ -237,7 +237,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s8
@@ -265,7 +265,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s8
@@ -274,7 +274,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s8
@@ -283,7 +283,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s8
@@ -309,7 +309,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s8
@@ -335,7 +335,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s8
@@ -363,12 +363,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset(ptr addrspace(7)
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -401,7 +401,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b64 s[2:3], exec
@@ -429,7 +429,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_mov_b32 s1, exec_lo
@@ -459,7 +459,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
@@ -521,7 +521,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX10-NEXT: v_mov_b32_e32 v0, v6
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[6:7], exec
@@ -547,7 +547,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
@@ -605,7 +605,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX908-NEXT: v_mov_b32_e32 v0, v6
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
@@ -663,7 +663,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX8-NEXT: v_mov_b32_e32 v0, v6
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
@@ -720,7 +720,7 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX7-NEXT: v_mov_b32_e32 v0, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
@@ -778,118 +778,68 @@ define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__waterfall(ptr ad
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: v_mov_b32_e32 v0, s4
-; GFX12-NEXT: s_addk_co_i32 s4, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
-; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB3_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
+; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
-; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB3_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: s_addk_i32 s8, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s8
-; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v10, v1
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v0, v7
-; GFX10-NEXT: v_mov_b32_e32 v1, v8
-; GFX10-NEXT: v_mov_b32_e32 v2, v9
-; GFX10-NEXT: v_mov_b32_e32 v3, v10
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB3_1
@@ -897,39 +847,35 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s8
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s10, s8, 0x800
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s10
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v10, v1
-; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v7
-; GFX908-NEXT: v_mov_b32_e32 v1, v8
-; GFX908-NEXT: v_mov_b32_e32 v2, v9
-; GFX908-NEXT: v_mov_b32_e32 v3, v10
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_cbranch_execnz .LBB3_1
@@ -937,30 +883,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s10, s8, 0x800
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
; GFX8-NEXT: s_mov_b64 s[8:9], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s10
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v7
-; GFX8-NEXT: v_mov_b32_e32 v1, v8
-; GFX8-NEXT: v_mov_b32_e32 v2, v9
-; GFX8-NEXT: v_mov_b32_e32 v3, v10
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB3_1
@@ -968,30 +910,26 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
; GFX7-NEXT: v_mov_b32_e32 v0, s8
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v10, v1
-; GFX7-NEXT: v_mov_b32_e32 v9, v0
-; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-NEXT: v_mov_b32_e32 v1, v8
-; GFX7-NEXT: v_mov_b32_e32 v2, v9
-; GFX7-NEXT: v_mov_b32_e32 v3, v10
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB3_1
@@ -999,31 +937,27 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
; GFX6-NEXT: v_mov_b32_e32 v0, s8
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
-; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
; GFX6-NEXT: .LBB3_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, v1
-; GFX6-NEXT: v_mov_b32_e32 v9, v0
-; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v7
-; GFX6-NEXT: v_mov_b32_e32 v1, v8
-; GFX6-NEXT: v_mov_b32_e32 v2, v9
-; GFX6-NEXT: v_mov_b32_e32 v3, v10
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB3_1
@@ -1031,111 +965,68 @@ define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset(ptr addrspace(7)
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v2, s4
-; GFX12-NEXT: s_addk_co_i32 s4, 0x800
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: v_mov_b32_e32 v6, s4
-; GFX12-NEXT: s_mov_b32 s4, 0
-; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
-; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1]
-; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
-; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
-; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB4_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s4
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
+; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x800
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
-; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
-; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB4_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x800
-; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX10-NEXT: v_mov_b32_e32 v10, v5
-; GFX10-NEXT: v_mov_b32_e32 v9, v4
+; GFX10-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v8, v3
-; GFX10-NEXT: v_mov_b32_e32 v7, v2
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
-; GFX10-NEXT: v_mov_b32_e32 v4, v7
-; GFX10-NEXT: v_mov_b32_e32 v5, v8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB4_1
@@ -1143,3521 +1034,2068 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset(ptr addrspace(7)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s8
-; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, s8
-; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
-; GFX908-NEXT: s_add_i32 s10, s8, 0x800
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v6, s10
-; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX908-NEXT: v_mov_b32_e32 v10, v5
-; GFX908-NEXT: v_mov_b32_e32 v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v8, v3
-; GFX908-NEXT: v_mov_b32_e32 v7, v2
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v7
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v5, v8
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB4_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, s8
-; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
-; GFX8-NEXT: s_add_i32 s10, s8, 0x800
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
; GFX8-NEXT: s_mov_b64 s[8:9], 0
-; GFX8-NEXT: v_mov_b32_e32 v6, s10
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX8-NEXT: v_mov_b32_e32 v10, v5
-; GFX8-NEXT: v_mov_b32_e32 v9, v4
-; GFX8-NEXT: v_mov_b32_e32 v8, v3
-; GFX8-NEXT: v_mov_b32_e32 v7, v2
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX8-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v7
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v5, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB4_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
-; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: v_mov_b32_e32 v1, s8
+; GFX7-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
-; GFX7-NEXT: v_mov_b32_e32 v10, v5
-; GFX7-NEXT: v_mov_b32_e32 v9, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX7-NEXT: v_add_f32_e32 v1, v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v5, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB4_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
-; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: v_mov_b32_e32 v1, s8
+; GFX6-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
; GFX6-NEXT: .LBB4_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX6-NEXT: v_add_f32_e32 v1, v2, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v10, v5
-; GFX6-NEXT: v_mov_b32_e32 v9, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v5, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB4_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
+ %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_readfirstlane_b32 s4, v9
-; GFX12-NEXT: v_readfirstlane_b32 s5, v10
-; GFX12-NEXT: v_readfirstlane_b32 s6, v7
-; GFX12-NEXT: v_readfirstlane_b32 s7, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB5_4 Depth 2
+; GFX12-NEXT: v_mov_b32_e32 v2, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: s_addk_co_i32 s4, 0x400
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v3, s4
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
+; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6]
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: v_mov_b32_e32 v5, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v9
-; GFX12-NEXT: v_readfirstlane_b32 s5, v10
-; GFX12-NEXT: v_readfirstlane_b32 s6, v7
-; GFX12-NEXT: v_readfirstlane_b32 s7, v8
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB5_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB5_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB5_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v6
-; GFX940-NEXT: v_mov_b32_e32 v6, v5
-; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
-; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-NEXT: ; implicit-def: $vgpr4
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB5_1
-; GFX940-NEXT: ; %bb.2:
-; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
-; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
-; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_readfirstlane_b32 s4, v9
-; GFX11-NEXT: v_readfirstlane_b32 s5, v10
-; GFX11-NEXT: v_readfirstlane_b32 s6, v7
-; GFX11-NEXT: v_readfirstlane_b32 s7, v8
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB5_4 Depth 2
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
-; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
-; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v9
-; GFX11-NEXT: v_readfirstlane_b32 s5, v10
-; GFX11-NEXT: v_readfirstlane_b32 s6, v7
-; GFX11-NEXT: v_readfirstlane_b32 s7, v8
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB5_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB5_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB5_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v8, v3
-; GFX10-NEXT: v_mov_b32_e32 v7, v2
-; GFX10-NEXT: v_mov_b32_e32 v10, v1
-; GFX10-NEXT: v_mov_b32_e32 v9, v0
-; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v9
-; GFX10-NEXT: v_readfirstlane_b32 s9, v10
-; GFX10-NEXT: v_readfirstlane_b32 s10, v7
-; GFX10-NEXT: v_readfirstlane_b32 s11, v8
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_1
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB5_4 Depth 2
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v0, v11
-; GFX10-NEXT: v_mov_b32_e32 v1, v12
-; GFX10-NEXT: v_mov_b32_e32 v2, v13
-; GFX10-NEXT: v_mov_b32_e32 v3, v14
-; GFX10-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_readfirstlane_b32 s8, v9
-; GFX10-NEXT: v_readfirstlane_b32 s9, v10
-; GFX10-NEXT: v_readfirstlane_b32 s10, v7
-; GFX10-NEXT: v_readfirstlane_b32 s11, v8
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB5_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
-; GFX10-NEXT: v_mov_b32_e32 v14, v1
-; GFX10-NEXT: v_mov_b32_e32 v13, v0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB5_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB5_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v6
-; GFX90A-NEXT: v_mov_b32_e32 v6, v5
-; GFX90A-NEXT: s_mov_b64 s[6:7], exec
-; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_mov_b32_e32 v3, s10
+; GFX90A-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
-; GFX90A-NEXT: ; %bb.2:
-; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v0
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v6
-; GFX90A-NEXT: v_mov_b32_e32 v1, v7
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v8, v3
-; GFX908-NEXT: v_mov_b32_e32 v7, v2
-; GFX908-NEXT: v_mov_b32_e32 v10, v1
-; GFX908-NEXT: v_mov_b32_e32 v9, v0
-; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
-; GFX908-NEXT: s_mov_b64 s[6:7], exec
-; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_readfirstlane_b32 s8, v9
-; GFX908-NEXT: v_readfirstlane_b32 s9, v10
-; GFX908-NEXT: v_readfirstlane_b32 s10, v7
-; GFX908-NEXT: v_readfirstlane_b32 s11, v8
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX908-NEXT: ; implicit-def: $vgpr4
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB5_1
-; GFX908-NEXT: ; %bb.2:
-; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB5_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v0, v11
-; GFX908-NEXT: v_mov_b32_e32 v1, v12
-; GFX908-NEXT: v_mov_b32_e32 v2, v13
-; GFX908-NEXT: v_mov_b32_e32 v3, v14
-; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: v_readfirstlane_b32 s8, v9
-; GFX908-NEXT: v_readfirstlane_b32 s9, v10
-; GFX908-NEXT: v_readfirstlane_b32 s10, v7
-; GFX908-NEXT: v_readfirstlane_b32 s11, v8
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB5_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX908-NEXT: v_mov_b32_e32 v14, v1
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v13, v0
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB5_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB5_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v8, v3
-; GFX8-NEXT: v_mov_b32_e32 v7, v2
-; GFX8-NEXT: v_mov_b32_e32 v10, v1
-; GFX8-NEXT: v_mov_b32_e32 v9, v0
-; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_readfirstlane_b32 s8, v9
-; GFX8-NEXT: v_readfirstlane_b32 s9, v10
-; GFX8-NEXT: v_readfirstlane_b32 s10, v7
-; GFX8-NEXT: v_readfirstlane_b32 s11, v8
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX8-NEXT: ; implicit-def: $vgpr4
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB5_1
-; GFX8-NEXT: ; %bb.2:
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB5_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v0, v11
-; GFX8-NEXT: v_mov_b32_e32 v1, v12
-; GFX8-NEXT: v_mov_b32_e32 v2, v13
-; GFX8-NEXT: v_mov_b32_e32 v3, v14
-; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_readfirstlane_b32 s8, v9
-; GFX8-NEXT: v_readfirstlane_b32 s9, v10
-; GFX8-NEXT: v_readfirstlane_b32 s10, v7
-; GFX8-NEXT: v_readfirstlane_b32 s11, v8
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB5_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX8-NEXT: v_mov_b32_e32 v14, v1
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v13, v0
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB5_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v10, v1
-; GFX7-NEXT: v_mov_b32_e32 v9, v0
-; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4
-; GFX7-NEXT: s_mov_b64 s[6:7], exec
-; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_readfirstlane_b32 s8, v9
-; GFX7-NEXT: v_readfirstlane_b32 s9, v10
-; GFX7-NEXT: v_readfirstlane_b32 s10, v7
-; GFX7-NEXT: v_readfirstlane_b32 s11, v8
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX7-NEXT: ; implicit-def: $vgpr4
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_1
-; GFX7-NEXT: ; %bb.2:
-; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB5_4 Depth 2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
-; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v0, v11
-; GFX7-NEXT: v_mov_b32_e32 v1, v12
-; GFX7-NEXT: v_mov_b32_e32 v2, v13
-; GFX7-NEXT: v_mov_b32_e32 v3, v14
-; GFX7-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT: v_readfirstlane_b32 s8, v9
-; GFX7-NEXT: v_readfirstlane_b32 s9, v10
-; GFX7-NEXT: v_readfirstlane_b32 s10, v7
-; GFX7-NEXT: v_readfirstlane_b32 s11, v8
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB5_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
+; GFX7-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX7-NEXT: v_mov_b32_e32 v14, v1
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v13, v0
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB5_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB5_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v10, v1
-; GFX6-NEXT: v_mov_b32_e32 v9, v0
-; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_readfirstlane_b32 s8, v9
-; GFX6-NEXT: v_readfirstlane_b32 s9, v10
-; GFX6-NEXT: v_readfirstlane_b32 s10, v7
-; GFX6-NEXT: v_readfirstlane_b32 s11, v8
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
-; GFX6-NEXT: ; implicit-def: $vgpr4
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_1
-; GFX6-NEXT: ; %bb.2:
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: .LBB5_3: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB5_4 Depth 2
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
+; GFX6-NEXT: .LBB5_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
-; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v11
-; GFX6-NEXT: v_mov_b32_e32 v1, v12
-; GFX6-NEXT: v_mov_b32_e32 v2, v13
-; GFX6-NEXT: v_mov_b32_e32 v3, v14
-; GFX6-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
-; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX6-NEXT: v_readfirstlane_b32 s8, v9
-; GFX6-NEXT: v_readfirstlane_b32 s9, v10
-; GFX6-NEXT: v_readfirstlane_b32 s10, v7
-; GFX6-NEXT: v_readfirstlane_b32 s11, v8
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
-; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
-; GFX6-NEXT: v_mov_b32_e32 v14, v1
-; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v13, v0
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB5_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB5_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
+ ret float %result
}
-; --------------------------------------------------------------------
-; half
-; --------------------------------------------------------------------
-
-define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s4, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s5, s4, -4
-; GFX12-NEXT: s_and_b32 s4, s4, 3
-; GFX12-NEXT: v_mov_b32_e32 v5, s5
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v3
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB6_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s5, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
-; GFX940-NEXT: s_and_b32 s4, s4, 3
-; GFX940-NEXT: s_lshl_b32 s6, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX940-NEXT: s_not_b32 s7, s4
-; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
-; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
-; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT: s_cbranch_execnz .LBB6_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s4, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s5, s4, -4
-; GFX11-NEXT: s_and_b32 s4, s4, 3
-; GFX11-NEXT: v_mov_b32_e32 v5, s5
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v3
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB6_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_addk_i32 s8, 0x200
-; GFX10-NEXT: s_and_b32 s9, s8, -4
-; GFX10-NEXT: s_and_b32 s8, s8, 3
-; GFX10-NEXT: v_mov_b32_e32 v5, s9
-; GFX10-NEXT: s_lshl_b32 s8, s8, 3
-; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
-; GFX10-NEXT: s_not_b32 s10, s9
-; GFX10-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
-; GFX10-NEXT: s_mov_b32 s9, 0
-; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v3
-; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB6_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s9, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s9
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen
-; GFX90A-NEXT: s_and_b32 s8, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX90A-NEXT: s_not_b32 s11, s8
-; GFX90A-NEXT: s_mov_b64 s[8:9], 0
-; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3
-; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s9, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v5, s9
-; GFX908-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
-; GFX908-NEXT: s_and_b32 s8, s8, 3
-; GFX908-NEXT: s_lshl_b32 s10, s8, 3
-; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX908-NEXT: s_not_b32 s11, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2
-; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v2
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_cbranch_execnz .LBB6_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s9, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v5, s9
-; GFX8-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
-; GFX8-NEXT: s_and_b32 s8, s8, 3
-; GFX8-NEXT: s_lshl_b32 s10, s8, 3
-; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX8-NEXT: s_not_b32 s11, s8
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2
-; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v3, s11, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v2
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB6_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s9, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s9
-; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_and_b32 s8, s8, 3
-; GFX7-NEXT: s_lshl_b32 s10, s8, 3
-; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB6_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s9, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s9
-; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_and_b32 s8, s8, 3
-; GFX6-NEXT: s_lshl_b32 s10, s8, 3
-; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB6_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
- ret half %result
+ %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret float %result
}
-define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+define float @buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s4, 0x200
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s5, s4, -4
-; GFX12-NEXT: s_and_b32 s4, s4, 3
-; GFX12-NEXT: v_mov_b32_e32 v3, s5
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB7_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s5, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v1, s5
-; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
-; GFX940-NEXT: s_and_b32 s4, s4, 3
-; GFX940-NEXT: s_lshl_b32 s6, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX940-NEXT: s_not_b32 s7, s4
-; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
-; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
-; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v3, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT: s_cbranch_execnz .LBB7_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s4, 0x200
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s5, s4, -4
-; GFX11-NEXT: s_and_b32 s4, s4, 3
-; GFX11-NEXT: v_mov_b32_e32 v3, s5
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
-; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: buffer_atomic_add_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB7_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_addk_i32 s8, 0x200
-; GFX10-NEXT: s_and_b32 s9, s8, -4
-; GFX10-NEXT: s_and_b32 s8, s8, 3
-; GFX10-NEXT: v_mov_b32_e32 v3, s9
-; GFX10-NEXT: s_lshl_b32 s8, s8, 3
-; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
-; GFX10-NEXT: s_not_b32 s10, s9
-; GFX10-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB7_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s9, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v1, s9
-; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen
-; GFX90A-NEXT: s_and_b32 s8, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX90A-NEXT: s_not_b32 s11, s8
-; GFX90A-NEXT: s_mov_b64 s[8:9], 0
-; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3
-; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen offset:1024 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s9, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v3, s9
-; GFX908-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
-; GFX908-NEXT: s_and_b32 s8, s8, 3
-; GFX908-NEXT: s_lshl_b32 s10, s8, 3
-; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX908-NEXT: s_not_b32 s11, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2
-; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1
-; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_cbranch_execnz .LBB7_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s9, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v3, s9
-; GFX8-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
-; GFX8-NEXT: s_and_b32 s8, s8, 3
-; GFX8-NEXT: s_lshl_b32 s10, s8, 3
-; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX8-NEXT: s_not_b32 s11, s8
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2
-; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
-; GFX8-NEXT: v_and_b32_e32 v4, s11, v2
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB7_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s9, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s9
-; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_and_b32 s8, s8, 3
-; GFX7-NEXT: s_lshl_b32 s10, s8, 3
-; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v3, s10
; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB7_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f32__offset__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s9, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_and_b32 s8, s8, 3
-; GFX6-NEXT: s_lshl_b32 s10, s8, 3
-; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v3, s10
; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB7_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
}
-define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX12-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v11, v7
-; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB8_4 Depth 2
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: s_addk_co_i32 s4, 0x800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
-; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB8_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX12-NEXT: v_mov_b32_e32 v7, v8
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB8_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v10, -4, v4
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v11, v6
-; GFX940-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB8_1
-; GFX940-NEXT: ; %bb.2:
-; GFX940-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Loop Header: Depth=1
-; GFX940-NEXT: ; Child Loop BB8_4 Depth 2
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX940-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB8_4
-; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX940-NEXT: s_mov_b64 exec, s[8:9]
+; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v7, v8
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB8_3
-; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX11-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v11, v7
-; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB8_4 Depth 2
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
-; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB8_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB8_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v6
-; GFX10-NEXT: v_and_b32_e32 v10, -4, v6
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v11, v7
-; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_1
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB8_4 Depth 2
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX10-NEXT: v_mov_b32_e32 v9, v7
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
-; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB8_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB8_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v11, v6
-; GFX90A-NEXT: s_mov_b64 s[6:7], exec
-; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
-; GFX90A-NEXT: ; %bb.2:
-; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB8_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
-; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v8
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB8_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v10, -4, v4
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v11, v6
-; GFX908-NEXT: s_mov_b64 s[6:7], exec
-; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB8_1
-; GFX908-NEXT: ; %bb.2:
-; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB8_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX908-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
-; GFX908-NEXT: v_mov_b32_e32 v9, v7
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
-; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX908-NEXT: s_add_i32 s10, s8, 0x800
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s10
+; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB8_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB8_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB8_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v10, -4, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v11, v6
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB8_1
-; GFX8-NEXT: ; %bb.2:
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB8_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7
-; GFX8-NEXT: v_add_f16_e32 v6, v6, v5
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
-; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
-; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
-; GFX8-NEXT: v_mov_b32_e32 v9, v7
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
-; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX8-NEXT: s_add_i32 s10, s8, 0x800
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s10
+; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB8_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB8_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB8_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
-; GFX7-NEXT: s_mov_b64 s[6:7], exec
-; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_1
-; GFX7-NEXT: ; %bb.2:
-; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
-; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB8_4 Depth 2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
-; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB8_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_1
-; GFX6-NEXT: ; %bb.2:
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
-; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
-; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB8_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
-; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
-; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
-; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB8_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB8_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB8_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
- ret half %result
+ %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
}
-; --------------------------------------------------------------------
-; bfloat
-; --------------------------------------------------------------------
-
-define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s4, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX12-NEXT: s_and_b32 s5, s4, -4
-; GFX12-NEXT: s_and_b32 s4, s4, 3
-; GFX12-NEXT: v_mov_b32_e32 v4, s5
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, s4
+; GFX12-NEXT: s_addk_co_i32 s4, 0x800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], null offen offset:2048
; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-NEXT: v_add_f64_e32 v[2:3], v[4:5], v[0:1]
+; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v2
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_cbranch_execnz .LBB9_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s5, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v4, s5
-; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
-; GFX940-NEXT: s_and_b32 s4, s4, 3
-; GFX940-NEXT: s_lshl_b32 s6, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX940-NEXT: s_not_b32 s7, s4
-; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v1, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX940-NEXT: s_cbranch_execnz .LBB9_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s4, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX11-NEXT: s_and_b32 s5, s4, -4
-; GFX11-NEXT: s_and_b32 s4, s4, 3
-; GFX11-NEXT: v_mov_b32_e32 v4, s5
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: v_mov_b32_e32 v2, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b64 v[4:5], v2, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX11-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v2
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_cbranch_execnz .LBB9_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_addk_i32 s8, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX10-NEXT: s_and_b32 s9, s8, -4
-; GFX10-NEXT: s_and_b32 s8, s8, 3
-; GFX10-NEXT: v_mov_b32_e32 v4, s9
-; GFX10-NEXT: s_lshl_b32 s8, s8, 3
-; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
-; GFX10-NEXT: s_not_b32 s10, s9
-; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_mov_b32_e32 v2, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v10, v5
+; GFX10-NEXT: v_mov_b32_e32 v9, v4
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX10-NEXT: v_mov_b32_e32 v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v8, v3
+; GFX10-NEXT: v_mov_b32_e32 v7, v2
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v2
-; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v4, v7
+; GFX10-NEXT: v_mov_b32_e32 v5, v8
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s9, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v4, s9
-; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX90A-NEXT: s_and_b32 s8, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX90A-NEXT: s_not_b32 s11, s8
-; GFX90A-NEXT: s_mov_b64 s[8:9], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
-; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s9, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v4, s9
-; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX908-NEXT: s_and_b32 s8, s8, 3
-; GFX908-NEXT: s_lshl_b32 s10, s8, 3
-; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX908-NEXT: s_not_b32 s11, s8
+; GFX908-NEXT: v_mov_b32_e32 v2, s8
+; GFX908-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
+; GFX908-NEXT: s_add_i32 s10, s8, 0x800
; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
-; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_mov_b32_e32 v6, s10
; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX908-NEXT: v_mov_b32_e32 v3, v1
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX908-NEXT: v_mov_b32_e32 v10, v5
+; GFX908-NEXT: v_mov_b32_e32 v9, v4
+; GFX908-NEXT: v_mov_b32_e32 v8, v3
+; GFX908-NEXT: v_mov_b32_e32 v7, v2
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v7
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v1, v2
+; GFX908-NEXT: v_mov_b32_e32 v5, v8
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_cbranch_execnz .LBB9_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s9, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v4, s9
-; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX8-NEXT: s_and_b32 s8, s8, 3
-; GFX8-NEXT: s_lshl_b32 s10, s8, 3
-; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX8-NEXT: s_not_b32 s11, s8
+; GFX8-NEXT: v_mov_b32_e32 v2, s8
+; GFX8-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
+; GFX8-NEXT: s_add_i32 s10, s8, 0x800
; GFX8-NEXT: s_mov_b64 s[8:9], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX8-NEXT: v_mov_b32_e32 v3, v1
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX8-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX8-NEXT: v_mov_b32_e32 v10, v5
+; GFX8-NEXT: v_mov_b32_e32 v9, v4
+; GFX8-NEXT: v_mov_b32_e32 v8, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v2
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v7
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: v_mov_b32_e32 v5, v8
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s9, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v4, s9
-; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX7-NEXT: s_and_b32 s8, s8, 3
-; GFX7-NEXT: s_lshl_b32 s10, s8, 3
-; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX7-NEXT: v_mov_b32_e32 v3, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7-NEXT: v_mov_b32_e32 v10, v5
+; GFX7-NEXT: v_mov_b32_e32 v9, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v8
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f64__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s9, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v4, s9
-; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
-; GFX6-NEXT: s_and_b32 s8, s8, 3
-; GFX6-NEXT: s_lshl_b32 s10, s8, 3
-; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dwordx2 v[4:5], v2, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
-; GFX6-NEXT: v_mov_b32_e32 v3, v1
-; GFX6-NEXT: v_mov_b32_e32 v2, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v10, v5
+; GFX6-NEXT: v_mov_b32_e32 v9, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v8
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB9_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
+ %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_addk_co_i32 s4, 0x200
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX12-NEXT: s_and_b32 s5, s4, -4
-; GFX12-NEXT: s_and_b32 s4, s4, 3
-; GFX12-NEXT: v_mov_b32_e32 v2, s5
-; GFX12-NEXT: s_lshl_b32 s4, s4, 3
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX12-NEXT: s_not_b32 s6, s5
-; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
-; GFX12-NEXT: s_mov_b32 s5, 0
-; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_readfirstlane_b32 s4, v9
+; GFX12-NEXT: v_readfirstlane_b32 s5, v10
+; GFX12-NEXT: v_readfirstlane_b32 s6, v7
+; GFX12-NEXT: v_readfirstlane_b32 s7, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-NEXT: ; Child Loop BB10_4 Depth 2
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-NEXT: v_add_f64_e32 v[11:12], v[13:14], v[5:6]
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX12-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v9
+; GFX12-NEXT: v_readfirstlane_b32 s5, v10
+; GFX12-NEXT: v_readfirstlane_b32 s6, v7
+; GFX12-NEXT: v_readfirstlane_b32 s7, v8
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB10_4
+; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX12-NEXT: v_mov_b32_e32 v1, v4
-; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX12-NEXT: s_cbranch_execnz .LBB10_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB10_3
+; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_addk_i32 s4, 0x200
-; GFX940-NEXT: s_and_b32 s5, s4, -4
-; GFX940-NEXT: v_mov_b32_e32 v2, s5
-; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
-; GFX940-NEXT: s_and_b32 s4, s4, 3
-; GFX940-NEXT: s_lshl_b32 s6, s4, 3
-; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX940-NEXT: s_not_b32 s7, s4
-; GFX940-NEXT: s_mov_b64 s[4:5], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: v_mov_b32_e32 v7, v6
+; GFX940-NEXT: v_mov_b32_e32 v6, v5
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX940-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX940-NEXT: v_mov_b32_e32 v1, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
+; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX940-NEXT: ; implicit-def: $vgpr4
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB10_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_addk_i32 s4, 0x200
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX11-NEXT: s_and_b32 s5, s4, -4
-; GFX11-NEXT: s_and_b32 s4, s4, 3
-; GFX11-NEXT: v_mov_b32_e32 v2, s5
-; GFX11-NEXT: s_lshl_b32 s4, s4, 3
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
-; GFX11-NEXT: s_not_b32 s6, s5
-; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
-; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_readfirstlane_b32 s4, v9
+; GFX11-NEXT: v_readfirstlane_b32 s5, v10
+; GFX11-NEXT: v_readfirstlane_b32 s6, v7
+; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB10_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
-; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
+; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
+; GFX11-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v9
+; GFX11-NEXT: v_readfirstlane_b32 s5, v10
+; GFX11-NEXT: v_readfirstlane_b32 s6, v7
+; GFX11-NEXT: v_readfirstlane_b32 s7, v8
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB10_4
+; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX11-NEXT: v_mov_b32_e32 v1, v4
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB10_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB10_3
+; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_addk_i32 s8, 0x200
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX10-NEXT: s_and_b32 s9, s8, -4
-; GFX10-NEXT: s_and_b32 s8, s8, 3
-; GFX10-NEXT: v_mov_b32_e32 v2, s9
-; GFX10-NEXT: s_lshl_b32 s8, s8, 3
-; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
-; GFX10-NEXT: s_not_b32 s10, s9
-; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX10-NEXT: s_mov_b32 s9, 0
-; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_mov_b32_e32 v8, v3
+; GFX10-NEXT: v_mov_b32_e32 v7, v2
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
+; GFX10-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v9
+; GFX10-NEXT: v_readfirstlane_b32 s9, v10
+; GFX10-NEXT: v_readfirstlane_b32 s10, v7
+; GFX10-NEXT: v_readfirstlane_b32 s11, v8
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB10_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB10_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v1
-; GFX10-NEXT: v_mov_b32_e32 v4, v0
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX10-NEXT: v_mov_b32_e32 v0, v11
+; GFX10-NEXT: v_mov_b32_e32 v1, v12
+; GFX10-NEXT: v_mov_b32_e32 v2, v13
+; GFX10-NEXT: v_mov_b32_e32 v3, v14
+; GFX10-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v9
+; GFX10-NEXT: v_readfirstlane_b32 s9, v10
+; GFX10-NEXT: v_readfirstlane_b32 s10, v7
+; GFX10-NEXT: v_readfirstlane_b32 s11, v8
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[9:10]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[7:8]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB10_4
+; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
+; GFX10-NEXT: v_mov_b32_e32 v14, v1
+; GFX10-NEXT: v_mov_b32_e32 v13, v0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
-; GFX10-NEXT: v_mov_b32_e32 v1, v4
-; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
-; GFX10-NEXT: s_cbranch_execnz .LBB10_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB10_3
+; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_addk_i32 s8, 0x200
-; GFX90A-NEXT: s_and_b32 s9, s8, -4
-; GFX90A-NEXT: v_mov_b32_e32 v2, s9
-; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX90A-NEXT: s_and_b32 s8, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
-; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX90A-NEXT: s_not_b32 s11, s8
-; GFX90A-NEXT: s_mov_b64 s[8:9], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v7, v6
+; GFX90A-NEXT: v_mov_b32_e32 v6, v5
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT: buffer_atomic_add_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, v6
+; GFX90A-NEXT: v_mov_b32_e32 v1, v7
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: s_addk_i32 s8, 0x200
-; GFX908-NEXT: s_and_b32 s9, s8, -4
-; GFX908-NEXT: v_mov_b32_e32 v2, s9
-; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX908-NEXT: s_and_b32 s8, s8, 3
-; GFX908-NEXT: s_lshl_b32 s10, s8, 3
-; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX908-NEXT: s_not_b32 s11, s8
-; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX908-NEXT: s_movk_i32 s12, 0x7fff
-; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v8, v3
+; GFX908-NEXT: v_mov_b32_e32 v7, v2
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v9
+; GFX908-NEXT: v_readfirstlane_b32 s9, v10
+; GFX908-NEXT: v_readfirstlane_b32 s10, v7
+; GFX908-NEXT: v_readfirstlane_b32 s11, v8
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB10_1
+; GFX908-NEXT: ; %bb.2:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Loop Header: Depth=1
+; GFX908-NEXT: ; Child Loop BB10_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v1
-; GFX908-NEXT: v_mov_b32_e32 v4, v0
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX908-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v0, v11
+; GFX908-NEXT: v_mov_b32_e32 v1, v12
+; GFX908-NEXT: v_mov_b32_e32 v2, v13
+; GFX908-NEXT: v_mov_b32_e32 v3, v14
+; GFX908-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT: v_readfirstlane_b32 s8, v9
+; GFX908-NEXT: v_readfirstlane_b32 s9, v10
+; GFX908-NEXT: v_readfirstlane_b32 s10, v7
+; GFX908-NEXT: v_readfirstlane_b32 s11, v8
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB10_4
+; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX908-NEXT: v_mov_b32_e32 v14, v1
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v13, v0
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v1, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB10_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB10_3
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_addk_i32 s8, 0x200
-; GFX8-NEXT: s_and_b32 s9, s8, -4
-; GFX8-NEXT: v_mov_b32_e32 v2, s9
-; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX8-NEXT: s_and_b32 s8, s8, 3
-; GFX8-NEXT: s_lshl_b32 s10, s8, 3
-; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX8-NEXT: s_not_b32 s11, s8
-; GFX8-NEXT: s_mov_b64 s[8:9], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: v_mov_b32_e32 v8, v3
+; GFX8-NEXT: v_mov_b32_e32 v7, v2
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v9
+; GFX8-NEXT: v_readfirstlane_b32 s9, v10
+; GFX8-NEXT: v_readfirstlane_b32 s10, v7
+; GFX8-NEXT: v_readfirstlane_b32 s11, v8
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB10_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB10_4 Depth 2
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
-; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX8-NEXT: v_mov_b32_e32 v5, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, v0
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX8-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v0, v11
+; GFX8-NEXT: v_mov_b32_e32 v1, v12
+; GFX8-NEXT: v_mov_b32_e32 v2, v13
+; GFX8-NEXT: v_mov_b32_e32 v3, v14
+; GFX8-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_readfirstlane_b32 s8, v9
+; GFX8-NEXT: v_readfirstlane_b32 s9, v10
+; GFX8-NEXT: v_readfirstlane_b32 s10, v7
+; GFX8-NEXT: v_readfirstlane_b32 s11, v8
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB10_4
+; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX8-NEXT: v_mov_b32_e32 v14, v1
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v13, v0
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v1, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX8-NEXT: s_cbranch_execnz .LBB10_1
-; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB10_3
+; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_addk_i32 s8, 0x200
-; GFX7-NEXT: s_and_b32 s9, s8, -4
-; GFX7-NEXT: v_mov_b32_e32 v2, s9
-; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX7-NEXT: s_and_b32 s8, s8, 3
-; GFX7-NEXT: s_lshl_b32 s10, s8, 3
-; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_not_b32 s11, s8
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v9
+; GFX7-NEXT: v_readfirstlane_b32 s9, v10
+; GFX7-NEXT: v_readfirstlane_b32 s10, v7
+; GFX7-NEXT: v_readfirstlane_b32 s11, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB10_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB10_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX7-NEXT: v_mov_b32_e32 v5, v1
-; GFX7-NEXT: v_mov_b32_e32 v4, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v0, v11
+; GFX7-NEXT: v_mov_b32_e32 v1, v12
+; GFX7-NEXT: v_mov_b32_e32 v2, v13
+; GFX7-NEXT: v_mov_b32_e32 v3, v14
+; GFX7-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: v_readfirstlane_b32 s8, v9
+; GFX7-NEXT: v_readfirstlane_b32 s9, v10
+; GFX7-NEXT: v_readfirstlane_b32 s10, v7
+; GFX7-NEXT: v_readfirstlane_b32 s11, v8
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB10_4
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX7-NEXT: v_mov_b32_e32 v14, v1
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v13, v0
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB10_1
-; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB10_3
+; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_addk_i32 s8, 0x200
-; GFX6-NEXT: s_and_b32 s9, s8, -4
-; GFX6-NEXT: v_mov_b32_e32 v2, s9
-; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
-; GFX6-NEXT: s_and_b32 s8, s8, 3
-; GFX6-NEXT: s_lshl_b32 s10, s8, 3
-; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX6-NEXT: s_not_b32 s11, s8
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
-; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_i32_e32 v15, vcc, 0x800, v4
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB10_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v9
+; GFX6-NEXT: v_readfirstlane_b32 s9, v10
+; GFX6-NEXT: v_readfirstlane_b32 s10, v7
+; GFX6-NEXT: v_readfirstlane_b32 s11, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB10_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: s_mov_b64 s[6:7], 0
+; GFX6-NEXT: .LBB10_3: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Loop Header: Depth=1
+; GFX6-NEXT: ; Child Loop BB10_4 Depth 2
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_add_f64 v[11:12], v[13:14], v[5:6]
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
-; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
-; GFX6-NEXT: v_mov_b32_e32 v5, v1
-; GFX6-NEXT: v_mov_b32_e32 v4, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v11
+; GFX6-NEXT: v_mov_b32_e32 v1, v12
+; GFX6-NEXT: v_mov_b32_e32 v2, v13
+; GFX6-NEXT: v_mov_b32_e32 v3, v14
+; GFX6-NEXT: .LBB10_4: ; Parent Loop BB10_3 Depth=1
+; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX6-NEXT: v_readfirstlane_b32 s8, v9
+; GFX6-NEXT: v_readfirstlane_b32 s9, v10
+; GFX6-NEXT: v_readfirstlane_b32 s10, v7
+; GFX6-NEXT: v_readfirstlane_b32 s11, v8
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB10_4
+; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB10_3 Depth=1
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
+; GFX6-NEXT: v_mov_b32_e32 v14, v1
+; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v13, v0
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
-; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB10_1
-; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB10_3
+; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
}
-define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX12-NEXT: v_not_b32_e32 v9, v6
-; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX12-NEXT: s_mov_b32 s1, 0
-; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Loop Header: Depth=1
-; GFX12-NEXT: ; Child Loop BB11_4 Depth 2
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: s_addk_co_i32 s4, 0x800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_mov_b32_e32 v4, v5
-; GFX12-NEXT: v_mov_b32_e32 v5, v6
-; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB11_4
-; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX12-NEXT: v_mov_b32_e32 v6, v4
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: s_cbranch_execnz .LBB11_3
-; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
-; GFX940-NEXT: v_not_b32_e32 v10, v4
-; GFX940-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB11_1
-; GFX940-NEXT: ; %bb.2:
-; GFX940-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; GFX940-NEXT: s_movk_i32 s10, 0x7fff
-; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Loop Header: Depth=1
-; GFX940-NEXT: ; Child Loop BB11_4 Depth 2
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: v_add_f32_e32 v4, v4, v11
-; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB11_4
-; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX940-NEXT: s_mov_b64 exec, s[8:9]
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB11_3
-; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX11-NEXT: v_not_b32_e32 v9, v6
-; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB11_4 Depth 2
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x800
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
+; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB11_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB11_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
-; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
-; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v9, v6
-; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_1
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB11_4 Depth 2
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
-; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB11_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB11_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB11_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
-; GFX90A-NEXT: v_not_b32_e32 v10, v4
-; GFX90A-NEXT: s_mov_b64 s[6:7], exec
-; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
-; GFX90A-NEXT: ; %bb.2:
-; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5
-; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
-; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB11_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB11_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
-; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
-; GFX908-NEXT: v_not_b32_e32 v9, v4
-; GFX908-NEXT: s_mov_b64 s[6:7], exec
-; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB11_1
-; GFX908-NEXT: ; %bb.2:
-; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX908-NEXT: s_movk_i32 s14, 0x7fff
-; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB11_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX908-NEXT: s_add_i32 s10, s8, 0x800
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v6, s10
+; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB11_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB11_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB11_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
-; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
-; GFX8-NEXT: v_not_b32_e32 v9, v4
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB11_1
-; GFX8-NEXT: ; %bb.2:
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
-; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB11_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
-; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX8-NEXT: s_add_i32 s10, s8, 0x800
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v6, s10
+; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB11_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB11_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB11_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
-; GFX7-NEXT: s_mov_b64 s[6:7], exec
-; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_1
-; GFX7-NEXT: ; %bb.2:
-; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
-; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB11_4 Depth 2
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
-; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
+; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB11_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_1
-; GFX6-NEXT: ; %bb.2:
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
-; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
-; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB11_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
-; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
-; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
-; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
+; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB11_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB11_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
+ %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret double %result
}
-; --------------------------------------------------------------------
-; <2 x half>
-; --------------------------------------------------------------------
-
-define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+define double @buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, double %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, s4
+; GFX12-NEXT: s_addk_co_i32 s4, 0x800
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_mov_b32_e32 v6, s4
+; GFX12-NEXT: s_mov_b32 s4, 0
+; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
+; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[7:8], v[9:10], v[4:5]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: v_mov_b32_e32 v2, s4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
+; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_addk_i32 s4, 0x800
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: v_mov_b32_e32 v6, s4
; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
+; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
+; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
@@ -4666,28 +3104,32 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: s_addk_i32 s8, 0x800
+; GFX10-NEXT: v_mov_b32_e32 v6, s8
+; GFX10-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v10, v1
+; GFX10-NEXT: v_mov_b32_e32 v9, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX10-NEXT: v_mov_b32_e32 v0, v7
+; GFX10-NEXT: v_mov_b32_e32 v1, v8
+; GFX10-NEXT: v_mov_b32_e32 v2, v9
+; GFX10-NEXT: v_mov_b32_e32 v3, v10
+; GFX10-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
@@ -4695,35 +3137,39 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s8
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX90A-NEXT: v_mov_b32_e32 v2, s8
+; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen offset:2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
; GFX908-NEXT: v_mov_b32_e32 v0, s8
-; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX908-NEXT: s_add_i32 s10, s8, 0x800
; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: v_mov_b32_e32 v6, s10
; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v5, v0
-; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: v_mov_b32_e32 v10, v1
+; GFX908-NEXT: v_mov_b32_e32 v9, v0
+; GFX908-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v7
+; GFX908-NEXT: v_mov_b32_e32 v1, v8
+; GFX908-NEXT: v_mov_b32_e32 v2, v9
+; GFX908-NEXT: v_mov_b32_e32 v3, v10
+; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_cbranch_execnz .LBB12_1
@@ -4731,28 +3177,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX8-NEXT: s_add_i32 s10, s8, 0x800
; GFX8-NEXT: s_mov_b64 s[8:9], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: v_mov_b32_e32 v6, s10
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v5, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
-; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
-; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: v_mov_b32_e32 v10, v1
+; GFX8-NEXT: v_mov_b32_e32 v9, v0
+; GFX8-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v7
+; GFX8-NEXT: v_mov_b32_e32 v1, v8
+; GFX8-NEXT: v_mov_b32_e32 v2, v9
+; GFX8-NEXT: v_mov_b32_e32 v3, v10
+; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
@@ -4760,45 +3208,30 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, s8
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX7-NEXT: s_add_i32 s10, s8, 0x800
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-NEXT: v_mov_b32_e32 v6, s10
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v10, v1
+; GFX7-NEXT: v_mov_b32_e32 v9, v0
+; GFX7-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v8
+; GFX7-NEXT: v_mov_b32_e32 v2, v9
+; GFX7-NEXT: v_mov_b32_e32 v3, v10
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
@@ -4806,46 +3239,31 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f64__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
-; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, s8
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v0, s[4:7], 0 offen offset:2048
+; GFX6-NEXT: s_add_i32 s10, s8, 0x800
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, s10
+; GFX6-NEXT: v_mov_b32_e32 v6, s10
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v10, v1
+; GFX6-NEXT: v_mov_b32_e32 v9, v0
+; GFX6-NEXT: v_add_f64 v[7:8], v[9:10], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
-; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v8
+; GFX6-NEXT: v_mov_b32_e32 v2, v9
+; GFX6-NEXT: v_mov_b32_e32 v3, v10
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
@@ -4853,1019 +3271,7233 @@ define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrsp
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
- ret <2 x half> %result
+ %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret double %result
}
-define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; --------------------------------------------------------------------
+; half
+; --------------------------------------------------------------------
+
+define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_addk_co_i32 s4, 0x200
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v5, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: buffer_load_b32 v2, v5, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v3
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: s_addk_i32 s4, 0x200
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
+; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT: s_cbranch_execnz .LBB13_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_addk_i32 s4, 0x200
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v5, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: v_mov_b32_e32 v3, s4
-; GFX11-NEXT: s_mov_b32 s4, 0
-; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: buffer_load_b32 v2, v5, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_mov_b32_e32 v4, v1
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v3, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[3:4], v5, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, v4
-; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v3
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_cbranch_execnz .LBB13_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: v_mov_b32_e32 v3, s8
-; GFX10-NEXT: s_mov_b32 s8, 0
-; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_addk_i32 s8, 0x200
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v5, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
+; GFX10-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v4, v1
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, v4
-; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v3
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, s8
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x200
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s9
+; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3
+; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, s8
-; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_addk_i32 s8, 0x200
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v5, s9
+; GFX908-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
; GFX908-NEXT: s_mov_b64 s[8:9], 0
-; GFX908-NEXT: v_mov_b32_e32 v3, s10
; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
-; GFX908-NEXT: v_mov_b32_e32 v5, v2
-; GFX908-NEXT: v_mov_b32_e32 v4, v1
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v2
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX908-NEXT: v_mov_b32_e32 v2, v4
+; GFX908-NEXT: v_mov_b32_e32 v2, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_cbranch_execnz .LBB13_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, s8
-; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
-; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_addk_i32 s8, 0x200
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v5, s9
+; GFX8-NEXT: buffer_load_dword v2, v5, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
; GFX8-NEXT: s_mov_b64 s[8:9], 0
-; GFX8-NEXT: v_mov_b32_e32 v3, s10
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
-; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
-; GFX8-NEXT: v_mov_b32_e32 v5, v2
-; GFX8-NEXT: v_mov_b32_e32 v4, v1
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
-; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX8-NEXT: v_mov_b32_e32 v2, v4
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v3, s11, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v2
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[3:4], v5, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_addk_i32 s8, 0x200
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
+; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: s_not_b32 s11, s8
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX7-NEXT: v_mov_b32_e32 v2, s10
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
-; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_addk_i32 s8, 0x200
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
+; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: s_not_b32 s11, s8
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
-; GFX6-NEXT: v_mov_b32_e32 v2, s10
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
}
-define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_addk_co_i32 s4, 0x200
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v3, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: buffer_load_b32 v2, v3, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX12-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX12-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, v5
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-NEXT: v_mov_b32_e32 v2, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_addk_i32 s4, 0x200
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v1, s5
+; GFX940-NEXT: buffer_load_dword v3, v1, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
+; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
-; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX940-NEXT: ; implicit-def: $vgpr4
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB14_1
-; GFX940-NEXT: ; %bb.2:
-; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, s6, v3
+; GFX940-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, s6, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, v5
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v3, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT: s_cbranch_execnz .LBB14_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB14_4 Depth 2
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v7, v8, v5
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_mov_b32_e32 v6, v7
-; GFX11-NEXT: v_mov_b32_e32 v7, v8
-; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_addk_i32 s4, 0x200
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v3, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: buffer_load_b32 v2, v3, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_lshrrev_b32_e32 v1, s4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, s4, v1
+; GFX11-NEXT: v_and_or_b32 v1, v2, s6, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX11-NEXT: v_mov_b32_e32 v8, v6
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB14_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB14_1
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB14_4 Depth 2
+; GFX10-NEXT: s_addk_i32 s8, 0x200
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v3, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
+; GFX10-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v7, v8, v5
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v1, s8, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_mov_b32_e32 v6, v7
-; GFX10-NEXT: v_mov_b32_e32 v7, v8
-; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
-; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB14_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX10-NEXT: v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v1, v2, s10, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
-; GFX10-NEXT: v_mov_b32_e32 v8, v6
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB14_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB14_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_mov_b64 s[6:7], exec
-; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_addk_i32 s8, 0x200
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v1, s9
+; GFX90A-NEXT: buffer_load_dword v3, v1, s[4:7], 0 offen
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX90A-NEXT: ; implicit-def: $vgpr4
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT: ; %bb.2:
-; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, s10, v3
+; GFX90A-NEXT: v_add_f16_e32 v2, v2, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, s10, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s11, v2
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v1, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, v5
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
-; GFX908-NEXT: s_mov_b64 s[6:7], exec
-; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB14_1
-; GFX908-NEXT: ; %bb.2:
-; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB14_4 Depth 2
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v7, v8, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v7
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v7, v8
-; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
-; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_addk_i32 s8, 0x200
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v3, s9
+; GFX908-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB14_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
-; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX908-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, s10, v1
+; GFX908-NEXT: v_and_or_b32 v1, v2, s11, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v8, v6
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB14_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB14_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB14_1
-; GFX8-NEXT: ; %bb.2:
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB14_4 Depth 2
+; GFX8-NEXT: s_addk_i32 s8, 0x200
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v3, s9
+; GFX8-NEXT: buffer_load_dword v2, v3, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
-; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_mov_b32_e32 v6, v7
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v7, v8
-; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
-; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB14_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
-; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, s10, v2
+; GFX8-NEXT: v_add_f16_e32 v1, v1, v0
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, s10, v1
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v8, v6
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB14_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB14_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
-; GFX7-NEXT: s_mov_b64 s[6:7], exec
-; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB14_1
-; GFX7-NEXT: ; %bb.2:
-; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB14_4 Depth 2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v10
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v11
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
-; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_addk_i32 s8, 0x200
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
+; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB14_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
-; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB14_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB14_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB14_1
-; GFX6-NEXT: ; %bb.2:
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX6-NEXT: s_addk_i32 s8, 0x200
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
+; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8
-; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB14_4 Depth 2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v10
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v11
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
-; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB14_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
-; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
-; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB14_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v4
-; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB14_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
- ret <2 x half> %result
+ %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-; --------------------------------------------------------------------
-; <2 x bfloat>
-; --------------------------------------------------------------------
-
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, half %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v1, s4
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
-; GFX940: ; %bb.0:
-; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: v_mov_b32_e32 v0, s4
-; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
-; GFX940-NEXT: s_addk_i32 s4, 0x400
-; GFX940-NEXT: s_mov_b64 s[6:7], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX940-NEXT: s_movk_i32 s8, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX940-NEXT: s_mov_b32 s9, 0x7060302
-; GFX940-NEXT: v_mov_b32_e32 v4, s4
-; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX12-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v11, v7
+; GFX12-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: buffer_load_b32 v7, v10, s[4:7], null offen
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX12-NEXT: v_and_b32_e32 v6, 0xffff, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX12-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX12-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB15_4
+; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX12-NEXT: v_mov_b32_e32 v7, v8
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB15_3
+; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
+; GFX940-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v6, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v11, v6
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: buffer_load_dword v7, v10, s[4:7], 0 offen
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB15_1
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Loop Header: Depth=1
+; GFX940-NEXT: ; Child Loop BB15_4 Depth 2
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v0
-; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
-; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX940-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX940-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: v_mov_b64_e32 v[8:9], v[6:7]
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
-; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
-; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[4:7], 0 offen sc0
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB15_4
+; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX940-NEXT: s_mov_b64 exec, s[8:9]
; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v7, v8
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
-; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT: s_cbranch_execnz .LBB15_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB15_3
+; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: s_addk_i32 s4, 0x400
-; GFX11-NEXT: s_mov_b32 s5, 0
-; GFX11-NEXT: v_mov_b32_e32 v4, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX11-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v11, v7
+; GFX11-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_b32 v7, v10, s[4:7], 0 offen
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB15_4 Depth 2
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
-; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX11-NEXT: v_and_or_b32 v6, v7, v11, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: v_dual_mov_b32 v9, v7 :: v_dual_mov_b32 v8, v6
+; GFX11-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[8:9], v10, s[4:7], 0 offen glc
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB15_4
+; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX11-NEXT: v_mov_b32_e32 v7, v8
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB15_3
+; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, s8
-; GFX10-NEXT: s_addk_i32 s8, 0x400
-; GFX10-NEXT: s_mov_b32 s9, 0
-; GFX10-NEXT: v_mov_b32_e32 v4, s8
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8
-; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v0, v5
-; GFX10-NEXT: v_mov_b32_e32 v1, v6
-; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x200, v4
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v6
+; GFX10-NEXT: v_and_b32_e32 v10, -4, v6
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v7, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v11, v7
+; GFX10-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB15_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB15_4 Depth 2
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX10-NEXT: v_lshlrev_b32_sdwa v6, v4, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX10-NEXT: v_mov_b32_e32 v9, v7
+; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB15_4
+; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v8, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v8
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
-; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
-; GFX10-NEXT: s_cbranch_execnz .LBB15_1
-; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB15_3
+; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_mov_b32_e32 v0, s8
-; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX90A-NEXT: s_addk_i32 s8, 0x400
-; GFX90A-NEXT: s_mov_b64 s[10:11], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
-; GFX90A-NEXT: v_mov_b32_e32 v4, s8
-; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
+; GFX90A-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v6, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v11, v6
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Loop Header: Depth=1
+; GFX90A-NEXT: ; Child Loop BB15_4 Depth 2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12
-; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX90A-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[8:9], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB15_4
+; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v8
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
-; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB15_3
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_mov_b32_e32 v0, s8
-; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX908-NEXT: s_addk_i32 s8, 0x400
-; GFX908-NEXT: s_mov_b64 s[10:11], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX908-NEXT: s_movk_i32 s12, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX908-NEXT: s_mov_b32 s13, 0x7060302
-; GFX908-NEXT: v_mov_b32_e32 v4, s8
-; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
+; GFX908-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v6, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v11, v6
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB15_1
+; GFX908-NEXT: ; %bb.2:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Loop Header: Depth=1
+; GFX908-NEXT: ; Child Loop BB15_4 Depth 2
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
-; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
-; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12
-; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
-; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9]
-; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
-; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13
-; GFX908-NEXT: v_mov_b32_e32 v0, v5
-; GFX908-NEXT: v_mov_b32_e32 v1, v6
-; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX908-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX908-NEXT: v_and_or_b32 v6, v7, v11, v6
+; GFX908-NEXT: v_mov_b32_e32 v9, v7
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB15_4
+; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX908-NEXT: s_mov_b64 exec, s[12:13]
; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v8
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
-; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX908-NEXT: s_cbranch_execnz .LBB15_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB15_3
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v4, v8
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s8
-; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX8-NEXT: s_addk_i32 s8, 0x400
-; GFX8-NEXT: s_mov_b64 s[10:11], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
-; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
-; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
+; GFX8-NEXT: v_and_b32_e32 v10, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v6, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v11, v6
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_dword v7, v10, s[8:11], 0 offen
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB15_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, v4, v7
+; GFX8-NEXT: v_add_f16_e32 v6, v6, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, v4, v6
+; GFX8-NEXT: v_and_b32_e32 v8, v7, v11
+; GFX8-NEXT: v_or_b32_e32 v6, v8, v6
+; GFX8-NEXT: v_mov_b32_e32 v9, v7
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_atomic_cmpswap v[8:9], v10, s[8:11], 0 offen glc
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB15_4
+; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB15_3
+; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v4, v8
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB15_4
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB15_3
+; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB15_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: s_mov_b64 s[6:7], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Loop Header: Depth=1
+; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
+; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB15_4
+; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB15_3
+; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
+}
+
+; --------------------------------------------------------------------
+; bfloat
+; --------------------------------------------------------------------
+
+define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_addk_co_i32 s4, 0x200
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v4, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v2
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_addk_i32 s4, 0x200
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v4, s5
+; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
+; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v1, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT: s_cbranch_execnz .LBB16_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_addk_i32 s4, 0x200
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v4, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v2
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_addk_i32 s8, 0x200
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v4, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
+; GFX10-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX10-NEXT: v_mov_b32_e32 v3, v1
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v2
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB16_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, s8, v2
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_addk_i32 s8, 0x200
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v4, s9
+; GFX90A-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_addk_i32 s8, 0x200
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v4, s9
+; GFX908-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v1
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: v_mov_b32_e32 v1, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB16_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_addk_i32 s8, 0x200
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v4, s9
+; GFX8-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v3, v1
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v1, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB16_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_addk_i32 s8, 0x200
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v4, s9
+; GFX7-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX7-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB16_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_addk_i32 s8, 0x200
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v4, s9
+; GFX6-NEXT: buffer_load_dword v1, v4, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v2, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX6-NEXT: v_mov_b32_e32 v3, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB16_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_addk_co_i32 s4, 0x200
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX12-NEXT: s_and_b32 s5, s4, -4
+; GFX12-NEXT: s_and_b32 s4, s4, 3
+; GFX12-NEXT: v_mov_b32_e32 v2, s5
+; GFX12-NEXT: s_lshl_b32 s4, s4, 3
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX12-NEXT: s_not_b32 s6, s5
+; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
+; GFX12-NEXT: s_mov_b32 s5, 0
+; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX12-NEXT: v_mov_b32_e32 v1, v4
+; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_addk_i32 s4, 0x200
+; GFX940-NEXT: s_and_b32 s5, s4, -4
+; GFX940-NEXT: v_mov_b32_e32 v2, s5
+; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
+; GFX940-NEXT: s_and_b32 s4, s4, 3
+; GFX940-NEXT: s_lshl_b32 s6, s4, 3
+; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
+; GFX940-NEXT: s_not_b32 s7, s4
+; GFX940-NEXT: s_mov_b64 s[4:5], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX940-NEXT: v_mov_b32_e32 v1, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX940-NEXT: s_cbranch_execnz .LBB17_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_addk_i32 s4, 0x200
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX11-NEXT: s_and_b32 s5, s4, -4
+; GFX11-NEXT: s_and_b32 s4, s4, 3
+; GFX11-NEXT: v_mov_b32_e32 v2, s5
+; GFX11-NEXT: s_lshl_b32 s4, s4, 3
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
+; GFX11-NEXT: s_not_b32 s6, s5
+; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
+; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v4
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_addk_i32 s8, 0x200
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX10-NEXT: s_and_b32 s9, s8, -4
+; GFX10-NEXT: s_and_b32 s8, s8, 3
+; GFX10-NEXT: v_mov_b32_e32 v2, s9
+; GFX10-NEXT: s_lshl_b32 s8, s8, 3
+; GFX10-NEXT: s_lshl_b32 s9, 0xffff, s8
+; GFX10-NEXT: s_not_b32 s10, s9
+; GFX10-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s8, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v0, v1, s10, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v4, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v4
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB17_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_addk_i32 s8, 0x200
+; GFX90A-NEXT: s_and_b32 s9, s8, -4
+; GFX90A-NEXT: v_mov_b32_e32 v2, s9
+; GFX90A-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX90A-NEXT: s_and_b32 s8, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s10, s8, 3
+; GFX90A-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX90A-NEXT: s_not_b32 s11, s8
+; GFX90A-NEXT: s_mov_b64 s[8:9], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v0, v1, s11, v0
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_addk_i32 s8, 0x200
+; GFX908-NEXT: s_and_b32 s9, s8, -4
+; GFX908-NEXT: v_mov_b32_e32 v2, s9
+; GFX908-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX908-NEXT: s_and_b32 s8, s8, 3
+; GFX908-NEXT: s_lshl_b32 s10, s8, 3
+; GFX908-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX908-NEXT: s_not_b32 s11, s8
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v0, v1, s11, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v1
+; GFX908-NEXT: v_mov_b32_e32 v4, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: v_mov_b32_e32 v1, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB17_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: s_addk_i32 s8, 0x200
+; GFX8-NEXT: s_and_b32 s9, s8, -4
+; GFX8-NEXT: v_mov_b32_e32 v2, s9
+; GFX8-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX8-NEXT: s_and_b32 s8, s8, 3
+; GFX8-NEXT: s_lshl_b32 s10, s8, 3
+; GFX8-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX8-NEXT: s_not_b32 s11, s8
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
+; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_mov_b32_e32 v0, s10
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
+; GFX8-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX8-NEXT: v_mov_b32_e32 v5, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v1, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB17_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_addk_i32 s8, 0x200
+; GFX7-NEXT: s_and_b32 s9, s8, -4
+; GFX7-NEXT: v_mov_b32_e32 v2, s9
+; GFX7-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX7-NEXT: s_and_b32 s8, s8, 3
+; GFX7-NEXT: s_lshl_b32 s10, s8, 3
+; GFX7-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: s_not_b32 s11, s8
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX7-NEXT: v_mov_b32_e32 v5, v1
+; GFX7-NEXT: v_mov_b32_e32 v4, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB17_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_addk_i32 s8, 0x200
+; GFX6-NEXT: s_and_b32 s9, s8, -4
+; GFX6-NEXT: v_mov_b32_e32 v2, s9
+; GFX6-NEXT: buffer_load_dword v1, v2, s[4:7], 0 offen
+; GFX6-NEXT: s_and_b32 s8, s8, 3
+; GFX6-NEXT: s_lshl_b32 s10, s8, 3
+; GFX6-NEXT: s_lshl_b32 s8, 0xffff, s10
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_not_b32 s11, s8
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, s10, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, s11, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, s10, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
+; GFX6-NEXT: v_mov_b32_e32 v5, v1
+; GFX6-NEXT: v_mov_b32_e32 v4, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB17_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX12-NEXT: v_not_b32_e32 v9, v6
+; GFX12-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX12-NEXT: s_mov_b32 s1, 0
+; GFX12-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Loop Header: Depth=1
+; GFX12-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX12-NEXT: s_mov_b32 s2, exec_lo
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v4, v5
+; GFX12-NEXT: v_mov_b32_e32 v5, v6
+; GFX12-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB18_4
+; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX12-NEXT: s_mov_b32 exec_lo, s2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: s_cbranch_execnz .LBB18_3
+; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
+; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
+; GFX940-NEXT: v_not_b32_e32 v10, v4
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB18_1
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX940-NEXT: s_movk_i32 s10, 0x7fff
+; GFX940-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Loop Header: Depth=1
+; GFX940-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: v_add_f32_e32 v4, v4, v11
+; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX940-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB18_4
+; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX940-NEXT: s_mov_b64 exec, s[8:9]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB18_3
+; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX11-NEXT: v_not_b32_e32 v9, v6
+; GFX11-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB18_4
+; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB18_3
+; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
+; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
+; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v9, v6
+; GFX10-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB18_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX10-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB18_4
+; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB18_3
+; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
+; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
+; GFX90A-NEXT: v_not_b32_e32 v10, v4
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5
+; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
+; GFX90A-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Loop Header: Depth=1
+; GFX90A-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v11
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
+; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_4
+; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_3
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
+; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX908-NEXT: v_not_b32_e32 v9, v4
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB18_1
+; GFX908-NEXT: ; %bb.2:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX908-NEXT: s_movk_i32 s14, 0x7fff
+; GFX908-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Loop Header: Depth=1
+; GFX908-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB18_4
+; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB18_3
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
+; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
+; GFX8-NEXT: v_not_b32_e32 v9, v4
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
+; GFX8-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
+; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_4
+; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB18_3
+; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
+; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_4
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB18_3
+; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
+; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB18_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: s_mov_b64 s[6:7], 0
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Loop Header: Depth=1
+; GFX6-NEXT: ; Child Loop BB18_4 Depth 2
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
+; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
+; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB18_4
+; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB18_3
+; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+; --------------------------------------------------------------------
+; <2 x half>
+; --------------------------------------------------------------------
+
+; Returning variant: agent-scope seq_cst atomicrmw fadd of a <2 x half> through
+; a buffer fat pointer (ptr addrspace(7)) passed inreg, tagged with
+; !amdgpu.no.fine.grained.memory. The access is at %ptr plus 256 elements,
+; which shows up as a 1024-byte offset in the checked assembly.
+; Per the checks below, GFX12, GFX940 and GFX90A emit a single returning
+; buffer_atomic_pk_add_f16, while GFX11, GFX10, GFX908, GFX8, GFX7 and GFX6
+; emit an %atomicrmw.start loop built around buffer_atomic_cmpswap.
+; NOTE(review): metadata !0 and attribute group #0 are defined outside this
+; excerpt - confirm their contents against the full test file.
+define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB19_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB19_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB19_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB19_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB19_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s10
+; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB19_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+; Step 256 <2 x half> elements past %ptr so the access carries a non-zero
+; offset (the 1024-byte / 0x400 value visible in the assembly above).
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
+; %result is returned, so the value produced by the atomic is used.
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+; No-return variant: agent-scope seq_cst atomicrmw fadd of a <2 x half> through
+; a buffer fat pointer (ptr addrspace(7)) passed inreg, tagged with
+; !amdgpu.no.fine.grained.memory. The atomic's result is never used and the
+; function returns void. The access is at %ptr plus 256 elements, which shows
+; up as a 1024-byte offset in the checked assembly.
+; Per the checks below, GFX12, GFX940 and GFX90A emit a single
+; buffer_atomic_pk_add_f16 without a return modifier, while GFX11, GFX10,
+; GFX908, GFX8, GFX7 and GFX6 emit an %atomicrmw.start loop built around
+; buffer_atomic_cmpswap.
+; NOTE(review): metadata !0 and attribute group #0 are defined outside this
+; excerpt - confirm their contents against the full test file.
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB20_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB20_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB20_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB20_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s10
+; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB20_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB20_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+; Step 256 <2 x half> elements past %ptr so the access carries a non-zero
+; offset (the 1024-byte / 0x400 value visible in the assembly above).
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
+; %unused is never read; only the memory update of the atomic is observable.
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x half> %val) #0 { ; %ptr is not inreg (descriptor arrives in v[0:3]); every check block below expects a v_readfirstlane loop around each buffer access
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB21_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+; gfx940 block -- one buffer_atomic_pk_add_f16 (sc0) inside the readfirstlane loop, with buffer_wbl2 sc1 before and buffer_inv sc1 after
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[4:7], 0 offen offset:1024 sc0
+; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX940-NEXT: ; implicit-def: $vgpr4
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB21_1
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v0, v5
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; gfx11 block -- buffer_load_b32 in a first readfirstlane loop, then an atomicrmw.start retry loop (v_pk_add_f16 + buffer_atomic_cmpswap_b32 in a nested readfirstlane loop)
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_b32 v8, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v7, v8, v5
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v6, v7
+; GFX11-NEXT: v_mov_b32_e32 v7, v8
+; GFX11-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[6:7], v9, s[4:7], 0 offen glc
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB21_4
+; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX11-NEXT: v_mov_b32_e32 v8, v6
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB21_3
+; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v6
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; gfx10 block -- same load + cmpswap retry structure as the gfx11 block, using buffer_load_dword / buffer_atomic_cmpswap and s_waitcnt_depctr
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v9, 0x400, v4
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v7, v8, v5
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v6, v7
+; GFX10-NEXT: v_mov_b32_e32 v7, v8
+; GFX10-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB21_4
+; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v6, v8
+; GFX10-NEXT: v_mov_b32_e32 v8, v6
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB21_3
+; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v6
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+; gfx90a block -- one buffer_atomic_pk_add_f16 (glc) inside the readfirstlane loop, followed by buffer_wbinvl1
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v5, v4, s[8:11], 0 offen offset:1024 glc
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, v5
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; gfx908 block -- buffer_load_dword, then a retry loop of v_pk_add_f16 + buffer_atomic_cmpswap until the returned dword equals the expected one
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_u32_e32 v9, 0x400, v4
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB21_1
+; GFX908-NEXT: ; %bb.2:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Loop Header: Depth=1
+; GFX908-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v7, v8, v5
+; GFX908-NEXT: v_mov_b32_e32 v6, v7
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v7, v8
+; GFX908-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB21_4
+; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v8, v6
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB21_3
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v6
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; gfx8 block -- same retry structure as the gfx908 block, but the sum is built from v_add_f16_sdwa (WORD_1) and v_add_f16, then OR-ed together
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x400, v4
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_dword v8, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v4, v8, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v6, v8, v5
+; GFX8-NEXT: v_or_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v6, v7
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v7, v8
+; GFX8-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_4
+; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v8, v6
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB21_3
+; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v6
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+; gfx7 block -- each half is widened with v_cvt_f32_f16, added with v_add_f32, narrowed and repacked for the cmpswap; the result is moved to v0 / v1 as two converted values
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v10
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v11
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_4
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB21_3
+; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; gfx6 block -- same instruction sequence as the gfx7 block, with additional s_waitcnt expcnt(0) instructions
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB21_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB21_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8
+; GFX6-NEXT: s_mov_b64 s[6:7], 0
+; GFX6-NEXT: .LBB21_3: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Loop Header: Depth=1
+; GFX6-NEXT: ; Child Loop BB21_4 Depth 2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v10
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v11
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: .LBB21_4: ; Parent Loop BB21_3 Depth=1
+; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB21_4
+; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB21_3 Depth=1
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB21_3
+; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v1, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 ; 256 elements of <2 x half> = 1024 bytes, which is the offset:1024 / 0x400 seen in the checks above
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) { ; returning agent-scope seq_cst fadd on <2 x half> with no metadata on the atomicrmw; %ptr is inreg, so the checks use an SGPR descriptor directly
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+; gfx940 block -- one buffer_atomic_pk_add_f16 with sc0, placed between buffer_wbl2 sc1 and buffer_inv sc1
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+; gfx11 block -- buffer_load_b32, then a retry loop (v_pk_add_f16 + buffer_atomic_cmpswap_b32) that exits once the returned dword equals the expected one
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB22_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+; gfx10 block -- same retry loop as the gfx11 block, using buffer_load_dword / buffer_atomic_cmpswap
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB22_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+; gfx90a block -- one buffer_atomic_pk_add_f16 with glc, followed by buffer_wbinvl1
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; gfx908 block -- buffer_load_dword, then a retry loop of v_pk_add_f16 + buffer_atomic_cmpswap
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB22_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; gfx8 block -- retry loop whose sum is built from v_add_f16_sdwa (WORD_1) and v_add_f16, OR-ed together before the cmpswap
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB22_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+; gfx7 block -- retry loop doing the add in f32 (v_cvt_f32_f16, v_add_f32, v_cvt_f16_f32) and repacking the halves for buffer_atomic_cmpswap
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB22_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; gfx6 block -- same instruction sequence as the gfx7 block, with additional s_waitcnt expcnt(0) instructions
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s10
+; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB22_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256 ; 256 elements of <2 x half> = 1024 bytes, which is the offset:1024 / 0x400 seen in the checks above
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+ ret <2 x half> %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB23_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB23_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB23_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB23_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s10
+; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB23_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB23_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x half> @buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v5, v0
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB24_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v2, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v1, v5, v2
+; GFX8-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB24_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s10
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v3, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s10
+; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB24_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret <2 x half> %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, s4
+; GFX11-NEXT: s_mov_b32 s4, 0
+; GFX11-NEXT: buffer_load_b32 v2, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_mov_b32_e32 v4, v1
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v2, v4
+; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_mov_b32_e32 v3, s8
+; GFX10-NEXT: s_mov_b32 s8, 0
+; GFX10-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX10-NEXT: v_mov_b32_e32 v5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_mov_b32_e32 v4, v1
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v2, v4
+; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_cbranch_execnz .LBB25_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_add_i32 s10, s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[8:9], 0
+; GFX908-NEXT: v_mov_b32_e32 v3, s10
+; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v1, v2, v0
+; GFX908-NEXT: v_mov_b32_e32 v5, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v1
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX908-NEXT: v_mov_b32_e32 v2, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_cbranch_execnz .LBB25_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v2, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_add_i32 s10, s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[8:9], 0
+; GFX8-NEXT: v_mov_b32_e32 v3, s10
+; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v1, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v4, v2, v0
+; GFX8-NEXT: v_or_b32_e32 v1, v4, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v1
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v2
+; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX8-NEXT: v_mov_b32_e32 v2, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_cbranch_execnz .LBB25_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
+; GFX7-NEXT: s_add_i32 s10, s8, 0x400
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX7-NEXT: v_mov_b32_e32 v2, s10
+; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB25_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2f16__offset__amdgpu_no_remote_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
+; GFX6-NEXT: s_add_i32 s10, s8, 0x400
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
+; GFX6-NEXT: v_mov_b32_e32 v2, s10
+; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB25_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+; --------------------------------------------------------------------
+; <2 x bfloat>
+; --------------------------------------------------------------------
+
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_addk_i32 s4, 0x400
+; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_cbranch_execnz .LBB26_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB26_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[10:11], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_addk_i32 s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[10:11], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s13, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_cbranch_execnz .LBB26_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_addk_i32 s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[10:11], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_cbranch_execnz .LBB26_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_addk_i32 s8, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_mov_b64 s[10:11], 0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_cbranch_execnz .LBB26_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_addk_i32 s8, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: s_mov_b64 s[10:11], 0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_cbranch_execnz .LBB26_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_addk_i32 s4, 0x400
+; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_cbranch_execnz .LBB27_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB27_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[10:11], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_addk_i32 s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[10:11], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s13, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_cbranch_execnz .LBB27_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_addk_i32 s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[10:11], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_cbranch_execnz .LBB27_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_addk_i32 s8, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_mov_b64 s[10:11], 0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_cbranch_execnz .LBB27_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_addk_i32 s8, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: s_mov_b64 s[10:11], 0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_cbranch_execnz .LBB27_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_mov_b32 s1, exec_lo
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: v_readfirstlane_b32 s4, v0
+; GFX12-NEXT: v_readfirstlane_b32 s5, v1
+; GFX12-NEXT: v_readfirstlane_b32 s6, v2
+; GFX12-NEXT: v_readfirstlane_b32 s7, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_and_saveexec_b32 s0, s0
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX12-NEXT: ; implicit-def: $vgpr4
+; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-NEXT: ; %bb.2:
+; GFX12-NEXT: s_mov_b32 exec_lo, s1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v0, v5
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
+; GFX940-NEXT: ; implicit-def: $vgpr4
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB28_1
+; GFX940-NEXT: ; %bb.2:
+; GFX940-NEXT: s_mov_b64 exec, s[2:3]
+; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX940-NEXT: s_movk_i32 s10, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX940-NEXT: s_mov_b32 s11, 0x7060302
+; GFX940-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Loop Header: Depth=1
+; GFX940-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX940-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT: s_mov_b64 s[8:9], exec
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
+; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11
+; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
+; GFX940-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX940-NEXT: v_readfirstlane_b32 s4, v0
+; GFX940-NEXT: v_readfirstlane_b32 s5, v1
+; GFX940-NEXT: v_readfirstlane_b32 s6, v2
+; GFX940-NEXT: v_readfirstlane_b32 s7, v3
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
+; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
+; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB28_4
+; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX940-NEXT: s_mov_b64 exec, s[8:9]
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT: s_cbranch_execnz .LBB28_3
+; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
+; GFX11-NEXT: ; implicit-def: $vgpr4
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-NEXT: ; %bb.2:
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Loop Header: Depth=1
+; GFX11-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
+; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX11-NEXT: v_mov_b32_e32 v4, v5
+; GFX11-NEXT: v_mov_b32_e32 v5, v6
+; GFX11-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX11-NEXT: v_readfirstlane_b32 s4, v0
+; GFX11-NEXT: v_readfirstlane_b32 s5, v1
+; GFX11-NEXT: v_readfirstlane_b32 s6, v2
+; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
+; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_and_saveexec_b32 s0, s0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
+; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB28_4
+; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB28_3
+; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
+; GFX10-NEXT: ; implicit-def: $vgpr4
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB28_1
+; GFX10-NEXT: ; %bb.2:
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX10-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Loop Header: Depth=1
+; GFX10-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: s_mov_b32 s6, exec_lo
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
+; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
+; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
+; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
+; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v5, v6
+; GFX10-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX10-NEXT: v_readfirstlane_b32 s8, v0
+; GFX10-NEXT: v_readfirstlane_b32 s9, v1
+; GFX10-NEXT: v_readfirstlane_b32 s10, v2
+; GFX10-NEXT: v_readfirstlane_b32 s11, v3
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
+; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
+; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB28_4
+; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_waitcnt_depctr 0xffe3
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB28_3
+; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
+; GFX90A-NEXT: s_mov_b64 s[6:7], exec
+; GFX90A-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
+; GFX90A-NEXT: ; implicit-def: $vgpr4
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT: ; %bb.2:
+; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
+; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
+; GFX90A-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Loop Header: Depth=1
+; GFX90A-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9
+; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
+; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
+; GFX90A-NEXT: s_mov_b64 s[12:13], exec
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
+; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
+; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
+; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_4
+; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB28_3
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
+; GFX908-NEXT: s_mov_b64 s[6:7], exec
+; GFX908-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
+; GFX908-NEXT: ; implicit-def: $vgpr4
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB28_1
+; GFX908-NEXT: ; %bb.2:
+; GFX908-NEXT: s_mov_b64 exec, s[6:7]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX908-NEXT: s_movk_i32 s14, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX908-NEXT: s_mov_b32 s15, 0x7060302
+; GFX908-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Loop Header: Depth=1
+; GFX908-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX908-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
+; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
+; GFX908-NEXT: v_mov_b32_e32 v4, v5
+; GFX908-NEXT: s_mov_b64 s[12:13], exec
+; GFX908-NEXT: v_mov_b32_e32 v5, v6
+; GFX908-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT: v_readfirstlane_b32 s8, v0
+; GFX908-NEXT: v_readfirstlane_b32 s9, v1
+; GFX908-NEXT: v_readfirstlane_b32 s10, v2
+; GFX908-NEXT: v_readfirstlane_b32 s11, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB28_4
+; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB28_3
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
+; GFX8-NEXT: s_mov_b64 s[6:7], exec
+; GFX8-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_nop 0
+; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
+; GFX8-NEXT: ; implicit-def: $vgpr4
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB28_1
+; GFX8-NEXT: ; %bb.2:
+; GFX8-NEXT: s_mov_b64 exec, s[6:7]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
+; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX8-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Loop Header: Depth=1
+; GFX8-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
+; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
+; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
+; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
+; GFX8-NEXT: v_mov_b32_e32 v4, v5
+; GFX8-NEXT: s_mov_b64 s[12:13], exec
+; GFX8-NEXT: v_mov_b32_e32 v5, v6
+; GFX8-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX8-NEXT: v_readfirstlane_b32 s8, v0
+; GFX8-NEXT: v_readfirstlane_b32 s9, v1
+; GFX8-NEXT: v_readfirstlane_b32 s10, v2
+; GFX8-NEXT: v_readfirstlane_b32 s11, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
+; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB28_4
+; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB28_3
+; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
+; GFX7-NEXT: s_mov_b64 s[6:7], exec
+; GFX7-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
+; GFX7-NEXT: ; implicit-def: $vgpr4
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB28_1
+; GFX7-NEXT: ; %bb.2:
+; GFX7-NEXT: s_mov_b64 exec, s[6:7]
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX7-NEXT: s_mov_b64 s[6:7], 0
+; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX7-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Loop Header: Depth=1
+; GFX7-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX7-NEXT: v_readfirstlane_b32 s8, v0
+; GFX7-NEXT: v_readfirstlane_b32 s9, v1
+; GFX7-NEXT: v_readfirstlane_b32 s10, v2
+; GFX7-NEXT: v_readfirstlane_b32 s11, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB28_4
+; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX7-NEXT: s_cbranch_execnz .LBB28_3
+; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v0, v7
+; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
+; GFX6-NEXT: s_mov_b64 s[6:7], exec
+; GFX6-NEXT: .LBB28_1: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
+; GFX6-NEXT: ; implicit-def: $vgpr4
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB28_1
+; GFX6-NEXT: ; %bb.2:
+; GFX6-NEXT: s_mov_b64 exec, s[6:7]
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
+; GFX6-NEXT: s_mov_b64 s[6:7], 0
+; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
+; GFX6-NEXT: .LBB28_3: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Loop Header: Depth=1
+; GFX6-NEXT: ; Child Loop BB28_4 Depth 2
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7
+; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v9
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: .LBB28_4: ; Parent Loop BB28_3 Depth=1
+; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX6-NEXT: v_readfirstlane_b32 s8, v0
+; GFX6-NEXT: v_readfirstlane_b32 s9, v1
+; GFX6-NEXT: v_readfirstlane_b32 s10, v2
+; GFX6-NEXT: v_readfirstlane_b32 s11, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
+; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
+; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
+; GFX6-NEXT: s_cbranch_execnz .LBB28_4
+; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB28_3 Depth=1
+; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX6-NEXT: s_cbranch_execnz .LBB28_3
+; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v0, v7
+; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+; Value-returning atomicrmw fadd of <2 x bfloat> through a buffer fat pointer
+; (addrspace 7), syncscope("agent") seq_cst, with no metadata attached to the
+; atomicrmw. The gep index of 256 elements shows up as offset:1024 in the
+; expected output. The GFX12 checks expect a single buffer_atomic_pk_add_bf16
+; with th:TH_ATOMIC_RETURN; every other prefix below expects a
+; buffer_atomic_cmpswap loop (%atomicrmw.start) that repeats until the
+; compare-and-swap returns the value that was previously observed.
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_addk_i32 s4, 0x400
+; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_cbranch_execnz .LBB29_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB29_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[10:11], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_addk_i32 s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[10:11], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s13, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_cbranch_execnz .LBB29_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_addk_i32 s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[10:11], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v0, v5
+; GFX8-NEXT: v_mov_b32_e32 v1, v6
+; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_cbranch_execnz .LBB29_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_addk_i32 s8, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_mov_b64 s[10:11], 0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, s8
+; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v1
+; GFX7-NEXT: v_mov_b32_e32 v5, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_cbranch_execnz .LBB29_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v4, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_addk_i32 s8, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: s_mov_b64 s[10:11], 0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, s8
+; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v3
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v1
+; GFX6-NEXT: v_mov_b32_e32 v5, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
+; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_cbranch_execnz .LBB29_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ ret <2 x bfloat> %result
+}
+
+; No-return form: atomicrmw fadd of <2 x bfloat> through a buffer fat pointer
+; (addrspace 7), syncscope("agent") seq_cst, with no metadata attached to the
+; atomicrmw; the result (%unused) is discarded and the function returns void.
+; The gep index of 256 elements shows up as offset:1024 in the expected
+; output. The GFX12 checks expect a single buffer_atomic_pk_add_bf16 without
+; th:TH_ATOMIC_RETURN; every other prefix below expects a
+; buffer_atomic_cmpswap loop (%atomicrmw.start) that repeats until the
+; compare-and-swap returns the value that was previously observed.
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_addk_i32 s4, 0x400
+; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_cbranch_execnz .LBB30_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB30_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[10:11], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_addk_i32 s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[10:11], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s13, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_cbranch_execnz .LBB30_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_addk_i32 s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[10:11], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_cbranch_execnz .LBB30_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_addk_i32 s8, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_mov_b64 s[10:11], 0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_cbranch_execnz .LBB30_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_addk_i32 s8, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: s_mov_b64 s[10:11], 0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_cbranch_execnz .LBB30_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ ret void
+}
+
+define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v1, v0
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_addk_i32 s4, 0x400
+; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v0
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
+; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
+; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_cbranch_execnz .LBB31_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_mov_b32 s5, 0
+; GFX11-NEXT: v_mov_b32_e32 v4, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
+; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s8
+; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v6
+; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB31_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, s8
+; GFX90A-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[10:11], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[8:9]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, s8
+; GFX908-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_addk_i32 s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[10:11], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX908-NEXT: s_mov_b32 s13, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v3
+; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
+; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12
+; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[8:9]
+; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v0, v5
+; GFX908-NEXT: v_mov_b32_e32 v1, v6
+; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[4:7], 0 offen glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_cbranch_execnz .LBB31_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v0, s8
+; GFX8-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_addk_i32 s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[10:11], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
; GFX8-NEXT: v_add_f32_e32 v1, v1, v3
@@ -5891,12 +10523,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX8-NEXT: s_cbranch_execnz .LBB15_1
+; GFX8-NEXT: s_cbranch_execnz .LBB31_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s8
@@ -5911,7 +10543,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: v_mov_b32_e32 v4, s8
-; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5933,12 +10565,12 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX7-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s8
@@ -5953,7 +10585,7 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX6-NEXT: v_mov_b32_e32 v4, s8
-; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5976,18 +10608,18 @@ define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset(ptr add
; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX6-NEXT: s_cbranch_execnz .LBB15_1
+; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %result
}
-define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -6001,7 +10633,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s4
@@ -6013,7 +10645,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX940-NEXT: s_mov_b32 s9, 0x7060302
; GFX940-NEXT: v_mov_b32_e32 v4, s4
-; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -6040,12 +10672,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX940-NEXT: v_mov_b32_e32 v1, v6
; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX940-NEXT: s_cbranch_execnz .LBB16_1
+; GFX940-NEXT: s_cbranch_execnz .LBB32_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0
@@ -6056,7 +10688,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
@@ -6088,13 +10720,13 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-NEXT: s_cbranch_execnz .LBB32_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s8
@@ -6104,7 +10736,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v4, s8
; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
; GFX10-NEXT: s_mov_b32 s9, 0
-; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -6133,12 +10765,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
-; GFX10-NEXT: s_cbranch_execnz .LBB16_1
+; GFX10-NEXT: s_cbranch_execnz .LBB32_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v1, s8
@@ -6150,7 +10782,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
; GFX90A-NEXT: v_mov_b32_e32 v4, s8
-; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -6176,12 +10808,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX90A-NEXT: v_mov_b32_e32 v1, v6
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v1, s8
@@ -6193,7 +10825,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX908-NEXT: s_mov_b32 s13, 0x7060302
; GFX908-NEXT: v_mov_b32_e32 v4, s8
-; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -6220,12 +10852,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX908-NEXT: s_cbranch_execnz .LBB16_1
+; GFX908-NEXT: s_cbranch_execnz .LBB32_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s8
@@ -6235,7 +10867,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
; GFX8-NEXT: v_mov_b32_e32 v4, s8
-; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
@@ -6265,12 +10897,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX8-NEXT: s_cbranch_execnz .LBB16_1
+; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, s8
@@ -6285,7 +10917,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX7-NEXT: v_mov_b32_e32 v2, s8
-; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -6307,12 +10939,12 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX7-NEXT: s_cbranch_execnz .LBB16_1
+; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, s8
@@ -6327,7 +10959,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX6-NEXT: v_mov_b32_e32 v2, s8
-; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
@@ -6350,679 +10982,396 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset(ptr addrspace
; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
-; GFX6-NEXT: s_cbranch_execnz .LBB16_1
+; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
- %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+define void @buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: s_mov_b32 s1, exec_lo
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: v_readfirstlane_b32 s4, v0
-; GFX12-NEXT: v_readfirstlane_b32 s5, v1
-; GFX12-NEXT: v_readfirstlane_b32 s6, v2
-; GFX12-NEXT: v_readfirstlane_b32 s7, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_and_saveexec_b32 s0, s0
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: buffer_atomic_pk_add_bf16 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
-; GFX12-NEXT: ; implicit-def: $vgpr4
-; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2:
-; GFX12-NEXT: s_mov_b32 exec_lo, s1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, v5
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s4
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], null offen offset:1024
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
-; GFX940-NEXT: s_mov_b64 s[2:3], exec
-; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
-; GFX940-NEXT: ; implicit-def: $vgpr4
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_1
-; GFX940-NEXT: ; %bb.2:
-; GFX940-NEXT: s_mov_b64 exec, s[2:3]
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
-; GFX940-NEXT: s_movk_i32 s10, 0x7fff
-; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX940-NEXT: s_mov_b32 s11, 0x7060302
-; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Loop Header: Depth=1
-; GFX940-NEXT: ; Child Loop BB17_4 Depth 2
+; GFX940-NEXT: v_mov_b32_e32 v1, s4
+; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
+; GFX940-NEXT: s_addk_i32 s4, 0x400
+; GFX940-NEXT: s_mov_b64 s[6:7], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX940-NEXT: s_movk_i32 s8, 0x7fff
+; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX940-NEXT: s_mov_b32 s9, 0x7060302
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX940-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: s_mov_b64 s[8:9], exec
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v10
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
-; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX940-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
-; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX940-NEXT: v_readfirstlane_b32 s4, v0
-; GFX940-NEXT: v_readfirstlane_b32 s5, v1
-; GFX940-NEXT: v_readfirstlane_b32 s6, v2
-; GFX940-NEXT: v_readfirstlane_b32 s7, v3
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
-; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
-; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_4
-; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX940-NEXT: s_mov_b64 exec, s[8:9]
+; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
+; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
+; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB17_3
-; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v0, v4
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX940-NEXT: v_mov_b32_e32 v1, v6
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX940-NEXT: s_cbranch_execnz .LBB33_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
-; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
-; GFX11-NEXT: ; implicit-def: $vgpr4
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2:
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
-; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
+; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_lshlrev_b32 v2, 16, v0
+; GFX11-NEXT: s_addk_i32 s4, 0x400
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
+; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
+; GFX11-NEXT: s_mov_b32 s5, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Loop Header: Depth=1
-; GFX11-NEXT: ; Child Loop BB17_4 Depth 2
+; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX11-NEXT: s_mov_b32 s2, exec_lo
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8
-; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
+; GFX11-NEXT: v_dual_add_f32 v5, v5, v3 :: v_dual_add_f32 v0, v0, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
+; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX11-NEXT: v_mov_b32_e32 v4, v5
-; GFX11-NEXT: v_mov_b32_e32 v5, v6
-; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX11-NEXT: v_readfirstlane_b32 s4, v0
-; GFX11-NEXT: v_readfirstlane_b32 s5, v1
-; GFX11-NEXT: v_readfirstlane_b32 s6, v2
-; GFX11-NEXT: v_readfirstlane_b32 s7, v3
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
-; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_and_saveexec_b32 s0, s0
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
-; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_4
-; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX11-NEXT: s_mov_b32 exec_lo, s2
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX11-NEXT: v_mov_b32_e32 v6, v4
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX11-NEXT: v_mov_b32_e32 v1, v5
+; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB17_3
-; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
+; GFX11-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v4
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
-; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX10-NEXT: ; implicit-def: $vgpr4
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_1
-; GFX10-NEXT: ; %bb.2:
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
-; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Loop Header: Depth=1
-; GFX10-NEXT: ; Child Loop BB17_4 Depth 2
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: s_mov_b32 s6, exec_lo
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
-; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
-; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
-; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
-; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
-; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
-; GFX10-NEXT: v_mov_b32_e32 v4, v5
-; GFX10-NEXT: v_mov_b32_e32 v5, v6
-; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX10-NEXT: v_readfirstlane_b32 s8, v0
-; GFX10-NEXT: v_readfirstlane_b32 s9, v1
-; GFX10-NEXT: v_readfirstlane_b32 s10, v2
-; GFX10-NEXT: v_readfirstlane_b32 s11, v3
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
-; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
-; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_and_saveexec_b32 s4, s4
+; GFX10-NEXT: v_mov_b32_e32 v1, s8
+; GFX10-NEXT: s_addk_i32 s8, 0x400
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX10-NEXT: v_mov_b32_e32 v4, s8
+; GFX10-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX10-NEXT: s_mov_b32 s9, 0
+; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB17_4
-; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX10-NEXT: s_mov_b32 exec_lo, s6
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s8, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s8
+; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v1
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT: s_waitcnt_depctr 0xffe3
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB17_3
-; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: s_or_b32 s9, vcc_lo, s9
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s9
+; GFX10-NEXT: s_cbranch_execnz .LBB33_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s9
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
-; GFX90A-NEXT: s_mov_b64 s[6:7], exec
-; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_nop 0
-; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX90A-NEXT: ; implicit-def: $vgpr4
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
-; GFX90A-NEXT: ; %bb.2:
-; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
-; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
-; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
-; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2
+; GFX90A-NEXT: v_mov_b32_e32 v1, s8
+; GFX90A-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX90A-NEXT: s_addk_i32 s8, 0x400
+; GFX90A-NEXT: s_mov_b64 s[10:11], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
+; GFX90A-NEXT: v_mov_b32_e32 v4, s8
+; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v10
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
-; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
+; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
-; GFX90A-NEXT: s_mov_b64 s[12:13], exec
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
-; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
-; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
-; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
-; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
-; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_4
-; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
+; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[4:7], 0 offen glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB17_3
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v4
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX90A-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v6
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
-; GFX908-NEXT: s_mov_b64 s[6:7], exec
-; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_nop 0
-; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX908-NEXT: ; implicit-def: $vgpr4
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB17_1
-; GFX908-NEXT: ; %bb.2:
-; GFX908-NEXT: s_mov_b64 exec, s[6:7]
-; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX908-NEXT: s_movk_i32 s14, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX908-NEXT: s_mov_b32 s15, 0x7060302
-; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB17_4 Depth 2
+; GFX908-NEXT: v_mov_b32_e32 v1, s8
+; GFX908-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX908-NEXT: s_addk_i32 s8, 0x400
+; GFX908-NEXT: s_mov_b64 s[10:11], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX908-NEXT: s_movk_i32 s12, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX908-NEXT: s_mov_b32 s13, 0x7060302
+; GFX908-NEXT: v_mov_b32_e32 v4, s8
+; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX908-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
-; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12
+; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
-; GFX908-NEXT: v_mov_b32_e32 v4, v5
-; GFX908-NEXT: s_mov_b64 s[12:13], exec
-; GFX908-NEXT: v_mov_b32_e32 v5, v6
-; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT: v_readfirstlane_b32 s8, v0
-; GFX908-NEXT: v_readfirstlane_b32 s9, v1
-; GFX908-NEXT: v_readfirstlane_b32 s10, v2
-; GFX908-NEXT: v_readfirstlane_b32 s11, v3
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
-; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB17_4
-; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX908-NEXT: s_mov_b64 exec, s[12:13]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13
+; GFX908-NEXT: v_mov_b32_e32 v6, v1
+; GFX908-NEXT: v_mov_b32_e32 v5, v0
+; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB17_3
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
-; GFX8-NEXT: s_mov_b64 s[6:7], exec
-; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_nop 0
-; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
-; GFX8-NEXT: ; implicit-def: $vgpr4
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB17_1
-; GFX8-NEXT: ; %bb.2:
-; GFX8-NEXT: s_mov_b64 exec, s[6:7]
-; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
-; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX8-NEXT: ; =>This Loop Header: Depth=1
-; GFX8-NEXT: ; Child Loop BB17_4 Depth 2
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
-; GFX8-NEXT: v_add_f32_e32 v4, v4, v8
-; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
-; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX908-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_cbranch_execnz .LBB33_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, s8
+; GFX8-NEXT: buffer_load_dword v1, v1, s[4:7], 0 offen offset:1024
+; GFX8-NEXT: s_addk_i32 s8, 0x400
+; GFX8-NEXT: s_mov_b64 s[10:11], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, s8
+; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[8:9], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[8:9]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
-; GFX8-NEXT: v_mov_b32_e32 v4, v5
-; GFX8-NEXT: s_mov_b64 s[12:13], exec
-; GFX8-NEXT: v_mov_b32_e32 v5, v6
-; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX8-NEXT: v_readfirstlane_b32 s8, v0
-; GFX8-NEXT: v_readfirstlane_b32 s9, v1
-; GFX8-NEXT: v_readfirstlane_b32 s10, v2
-; GFX8-NEXT: v_readfirstlane_b32 s11, v3
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
-; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB17_4
-; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX8-NEXT: s_mov_b64 exec, s[12:13]
+; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
+; GFX8-NEXT: v_mov_b32_e32 v6, v1
+; GFX8-NEXT: v_mov_b32_e32 v5, v0
+; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[4:7], 0 offen glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
-; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB17_3
-; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
+; GFX8-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX8-NEXT: s_cbranch_execnz .LBB33_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
-; GFX7-NEXT: s_mov_b64 s[6:7], exec
-; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX7-NEXT: ; implicit-def: $vgpr4
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_1
-; GFX7-NEXT: ; %bb.2:
-; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX7-NEXT: s_addk_i32 s8, 0x400
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: s_mov_b64 s[10:11], 0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Loop Header: Depth=1
-; GFX7-NEXT: ; Child Loop BB17_4 Depth 2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v4
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v9
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: s_mov_b64 s[12:13], exec
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s8
+; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX7-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX7-NEXT: v_readfirstlane_b32 s8, v0
-; GFX7-NEXT: v_readfirstlane_b32 s9, v1
-; GFX7-NEXT: v_readfirstlane_b32 s10, v2
-; GFX7-NEXT: v_readfirstlane_b32 s11, v3
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
-; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_4
-; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX7-NEXT: s_mov_b64 exec, s[12:13]
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
-; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX7-NEXT: s_cbranch_execnz .LBB17_3
-; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v0, v7
-; GFX7-NEXT: v_mov_b32_e32 v1, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX7-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX7-NEXT: s_cbranch_execnz .LBB33_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[10:11]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_v2bf16__offset__waterfall:
+; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_noret_v2bf16__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
-; GFX6-NEXT: s_mov_b64 s[6:7], exec
-; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
-; GFX6-NEXT: ; implicit-def: $vgpr4
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_1
-; GFX6-NEXT: ; %bb.2:
-; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v6
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: buffer_load_dword v2, v2, s[4:7], 0 offen offset:1024
+; GFX6-NEXT: s_addk_i32 s8, 0x400
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: s_mov_b64 s[10:11], 0
+; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
-; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start
-; GFX6-NEXT: ; =>This Loop Header: Depth=1
-; GFX6-NEXT: ; Child Loop BB17_4 Depth 2
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; GFX6-NEXT: v_mul_f32_e32 v7, 1.0, v4
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v9
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v5, v6, v5, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: s_mov_b64 s[12:13], exec
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, s8
+; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v1
+; GFX6-NEXT: v_add_f32_e32 v5, v5, v0
+; GFX6-NEXT: v_alignbit_b32 v4, v3, v4, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_alignbit_b32 v3, v3, v5, 16
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
-; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
-; GFX6-NEXT: v_readfirstlane_b32 s8, v0
-; GFX6-NEXT: v_readfirstlane_b32 s9, v1
-; GFX6-NEXT: v_readfirstlane_b32 s10, v2
-; GFX6-NEXT: v_readfirstlane_b32 s11, v3
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
-; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
-; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
-; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
-; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_4
-; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
-; GFX6-NEXT: s_mov_b64 exec, s[12:13]
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[4:7], 0 offen glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
-; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX6-NEXT: s_cbranch_execnz .LBB17_3
-; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
-; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v0, v7
-; GFX6-NEXT: v_mov_b32_e32 v1, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
+; GFX6-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_cbranch_execnz .LBB33_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[10:11]
+; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
- ret <2 x bfloat> %result
+ %unused = atomicrmw fadd ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret void
}
; --------------------------------------------------------------------
; misc
; --------------------------------------------------------------------
-define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
-; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, float %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -7036,7 +11385,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: v_mov_b32_e32 v3, s4
; GFX12-NEXT: s_mov_b32 s4, 0
; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
-; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v5, v0
@@ -7051,12 +11400,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
+; GFX12-NEXT: s_cbranch_execnz .LBB34_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX940-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v1, s4
@@ -7066,7 +11415,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX11-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v2, v0
@@ -7076,7 +11425,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-NEXT: v_mov_b32_e32 v3, s4
; GFX11-NEXT: s_mov_b32 s4, 0
; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
-; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v5, v0
@@ -7092,12 +11441,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
+; GFX11-NEXT: s_cbranch_execnz .LBB34_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX10-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v2, v0
@@ -7106,7 +11455,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: v_mov_b32_e32 v3, s8
; GFX10-NEXT: s_mov_b32 s8, 0
; GFX10-NEXT: buffer_load_dword v0, v0, s[4:7], 0 offen offset:1024
-; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v5, v0
@@ -7121,12 +11470,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
; GFX10-NEXT: s_or_b32 s8, vcc_lo, s8
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s8
-; GFX10-NEXT: s_cbranch_execnz .LBB18_1
+; GFX10-NEXT: s_cbranch_execnz .LBB34_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s8
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, v0
@@ -7135,7 +11484,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX90A-NEXT: s_add_i32 s10, s8, 0x400
; GFX90A-NEXT: s_mov_b64 s[8:9], 0
; GFX90A-NEXT: v_mov_b32_e32 v3, s10
-; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v0
@@ -7149,12 +11498,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX90A-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX908-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v2, v0
@@ -7163,7 +11512,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: s_add_i32 s10, s8, 0x400
; GFX908-NEXT: s_mov_b64 s[8:9], 0
; GFX908-NEXT: v_mov_b32_e32 v3, s10
-; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, v0
@@ -7176,12 +11525,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX908-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX908-NEXT: s_cbranch_execnz .LBB18_1
+; GFX908-NEXT: s_cbranch_execnz .LBB34_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX8-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, v0
@@ -7190,7 +11539,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: s_add_i32 s10, s8, 0x400
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: v_mov_b32_e32 v3, s10
-; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v5, v0
@@ -7203,12 +11552,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX8-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX8-NEXT: s_cbranch_execnz .LBB18_1
+; GFX8-NEXT: s_cbranch_execnz .LBB34_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX7-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v2, v0
@@ -7217,7 +11566,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: s_add_i32 s10, s8, 0x400
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_mov_b32_e32 v3, s10
-; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v5, v0
@@ -7230,12 +11579,12 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB18_1
+; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset:
+; GFX6-LABEL: buffer_fat_ptr_system_atomic_fadd_ret_f32__offset__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v2, v0
@@ -7244,7 +11593,7 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX6-NEXT: s_add_i32 s10, s8, 0x400
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_mov_b32_e32 v3, s10
-; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v5, v0
@@ -7258,16 +11607,16 @@ define float @buffer_fat_ptr_system_atomic_fadd_ret_f32__offset(ptr addrspace(7)
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB18_1
+; GFX6-NEXT: s_cbranch_execnz .LBB34_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
- %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst
+ %result = atomicrmw fadd ptr addrspace(7) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %result
}
attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
-
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 458f966b0f75f..af7e11127fbae 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -12,8 +12,8 @@
; float
; --------------------------------------------------------------------
-define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32:
+define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -26,7 +26,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -35,7 +35,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -45,7 +45,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
@@ -69,7 +69,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
@@ -119,7 +119,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
@@ -141,7 +141,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
@@ -163,7 +163,7 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v3, v[0:1]
@@ -184,12 +184,12 @@ define float @flat_agent_atomic_fadd_ret_f32(ptr %ptr, float %val) #0 {
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -202,7 +202,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -211,7 +211,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -221,7 +221,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
@@ -246,7 +246,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0
@@ -302,7 +302,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
@@ -324,7 +324,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -347,7 +347,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
@@ -370,12 +370,12 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -388,7 +388,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
@@ -400,7 +400,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
@@ -412,7 +412,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
@@ -437,7 +437,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
@@ -493,7 +493,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
@@ -518,7 +518,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -541,7 +541,7 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
@@ -564,12 +564,12 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg(ptr %ptr, float %val
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
- %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32:
+define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -582,7 +582,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -591,7 +591,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -602,7 +602,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v4, v[0:1]
@@ -625,7 +625,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
@@ -677,7 +677,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
@@ -729,7 +729,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v4, v[0:1]
@@ -750,7 +750,7 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v4, v[0:1]
@@ -770,12 +770,12 @@ define void @flat_agent_atomic_fadd_noret_f32(ptr %ptr, float %val) #0 {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -788,7 +788,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -797,7 +797,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -808,7 +808,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
@@ -833,7 +833,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
@@ -887,7 +887,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
@@ -941,7 +941,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -964,7 +964,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
@@ -987,12 +987,12 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1005,7 +1005,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
@@ -1017,7 +1017,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
@@ -1030,7 +1030,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
@@ -1055,7 +1055,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
@@ -1109,7 +1109,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
@@ -1163,7 +1163,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -1186,7 +1186,7 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
@@ -1209,12 +1209,12 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg(ptr %ptr, float %va
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 -512
- %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1243,7 +1243,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -1252,7 +1252,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
@@ -1278,7 +1278,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
@@ -1303,7 +1303,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
@@ -1327,7 +1327,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
@@ -1349,7 +1349,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1372,7 +1372,7 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
@@ -1395,12 +1395,12 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos(ptr %ptr, float %va
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, float %val seq_cst
+ %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1427,7 +1427,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -1436,7 +1436,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
@@ -1460,7 +1460,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
@@ -1485,7 +1485,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
@@ -1508,7 +1508,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
@@ -1529,7 +1529,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -1552,7 +1552,7 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
@@ -1575,56 +1575,81 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos(ptr %ptr, float %v
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, float %val seq_cst
+ %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-; --------------------------------------------------------------------
-; float with ftz/daz
-; --------------------------------------------------------------------
-
-define float @flat_agent_atomic_fadd_ret_f32__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+define void @flat_agent_atomic_fadd_noret_f32_maybe_remote(ptr %ptr, float %val) {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -1632,135 +1657,108 @@ define float @flat_agent_atomic_fadd_ret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB8_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr3
-; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB8_6
-; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
-; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX90A-NEXT: ; implicit-def: $vgpr3
-; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execz .LBB8_3
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2
-; GFX90A-NEXT: .LBB8_3: ; %Flow
-; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX90A-NEXT: s_cbranch_execz .LBB8_5
-; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: .LBB8_5: ; %Flow1
-; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX90A-NEXT: ; implicit-def: $vgpr2
-; GFX90A-NEXT: .LBB8_6: ; %Flow2
-; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB8_8
-; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
-; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: .LBB8_8: ; %atomicrmw.phi
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB8_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB8_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst
- ret float %result
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
+ ret void
}
-define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1768,48 +1766,49 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB9_1
@@ -1817,14 +1816,13 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
-; GFX90A: ; %bb.0:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB9_3
@@ -1837,116 +1835,147 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, float
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB9_3: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB9_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB9_5: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB9_7
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2
-; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB9_7: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB9_2
; GFX90A-NEXT: .LBB9_8: ; %atomicrmw.shared
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
-; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
-; GFX908: ; %bb.0:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB9_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB9_8
+; GFX908-NEXT: .LBB9_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB9_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB9_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB9_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB9_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB9_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB9_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB9_2
+; GFX908-NEXT: .LBB9_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB9_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB9_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
- ret float %result
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+define void @flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1954,53 +1983,49 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
@@ -2008,14 +2033,13 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
-; GFX90A: ; %bb.0:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB10_3
@@ -2028,119 +2052,147 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr %ptr, float
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB10_3: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5
-; GFX90A-NEXT: ; implicit-def: $vgpr0
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB10_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB10_5: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB10_7
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc
-; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2
-; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
; GFX90A-NEXT: .LBB10_7: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB10_2
; GFX90A-NEXT: .LBB10_8: ; %atomicrmw.shared
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
-; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
-; GFX908: ; %bb.0:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dword v0, v[0:1]
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB10_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB10_8
+; GFX908-NEXT: .LBB10_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB10_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB10_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB10_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB10_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB10_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB10_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB10_2
+; GFX908-NEXT: .LBB10_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v1, v0
-; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr %ptr, i64 -512
- %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
- ret float %result
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret void
}
-define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+define void @flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -2148,36 +2200,38 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2196,9 +2250,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -2248,9 +2304,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
@@ -2300,9 +2358,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2321,9 +2381,11 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
@@ -2341,12 +2403,17 @@ define void @flat_agent_atomic_fadd_noret_f32__ftz(ptr %ptr, float %val) #1 {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0
ret void
}
-define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; --------------------------------------------------------------------
+; float with ftz/daz
+; --------------------------------------------------------------------
+
+define float @flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -2354,41 +2421,39 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -2396,174 +2461,135 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
-; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB12_3
-; GFX90A-NEXT: ; %bb.1: ; %Flow2
-; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB12_8
-; GFX90A-NEXT: .LBB12_2: ; %atomicrmw.phi
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-; GFX90A-NEXT: .LBB12_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_cbranch_execz .LBB12_6
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execz .LBB12_5
-; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_cbranch_execz .LBB12_3
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
-; GFX90A-NEXT: .LBB12_5: ; %Flow
+; GFX90A-NEXT: .LBB12_3: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX90A-NEXT: s_cbranch_execz .LBB12_7
-; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: s_cbranch_execz .LBB12_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX90A-NEXT: .LBB12_7: ; %Flow1
+; GFX90A-NEXT: .LBB12_5: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB12_6: ; %Flow2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX90A-NEXT: s_cbranch_execz .LBB12_2
-; GFX90A-NEXT: .LBB12_8: ; %atomicrmw.shared
+; GFX90A-NEXT: s_cbranch_execz .LBB12_8
+; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB12_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
-; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB12_3
-; GFX908-NEXT: ; %bb.1: ; %Flow2
-; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB12_8
-; GFX908-NEXT: .LBB12_2: ; %atomicrmw.phi
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB12_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-; GFX908-NEXT: .LBB12_3: ; %atomicrmw.check.private
-; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execz .LBB12_5
-; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2
-; GFX908-NEXT: .LBB12_5: ; %Flow
-; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX908-NEXT: s_cbranch_execz .LBB12_7
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX908-NEXT: .LBB12_7: ; %Flow1
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2
-; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_cbranch_execz .LBB12_2
-; GFX908-NEXT: .LBB12_8: ; %atomicrmw.shared
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX908-NEXT: ds_add_f32 v0, v2
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+define float @flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -2571,54 +2597,48 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
@@ -2626,13 +2646,14 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
-; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fc, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB13_3
@@ -2645,215 +2666,157 @@ define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr %ptr, floa
; GFX90A-NEXT: s_setpc_b64 s[30:31]
; GFX90A-NEXT: .LBB13_3: ; %atomicrmw.check.private
; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0
; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB13_5
; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: .LBB13_5: ; %Flow
; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX90A-NEXT: s_cbranch_execz .LBB13_7
; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2
+; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
; GFX90A-NEXT: .LBB13_7: ; %Flow1
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $vgpr2
; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT: s_cbranch_execz .LBB13_2
; GFX90A-NEXT: .LBB13_8: ; %atomicrmw.shared
-; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
+; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
-; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
-; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB13_3
-; GFX908-NEXT: ; %bb.1: ; %Flow2
-; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB13_8
-; GFX908-NEXT: .LBB13_2: ; %atomicrmw.phi
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: s_setpc_b64 s[30:31]
-; GFX908-NEXT: .LBB13_3: ; %atomicrmw.check.private
-; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
-; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
-; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
-; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execz .LBB13_5
-; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2
-; GFX908-NEXT: .LBB13_5: ; %Flow
-; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
-; GFX908-NEXT: s_cbranch_execz .LBB13_7
-; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
-; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
-; GFX908-NEXT: .LBB13_7: ; %Flow1
-; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
-; GFX908-NEXT: ; implicit-def: $vgpr2
-; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; GFX908-NEXT: s_cbranch_execz .LBB13_2
-; GFX908-NEXT: .LBB13_8: ; %atomicrmw.shared
-; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GFX908-NEXT: ds_add_f32 v0, v2
-; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB13_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v1, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr %ptr, i64 -512
- %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -2874,57 +2837,92 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
-; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB14_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB14_8
+; GFX90A-NEXT: .LBB14_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v5
+; GFX90A-NEXT: ; implicit-def: $vgpr0
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB14_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[4:5], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB14_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB14_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v1, -1, v4, vcc
+; GFX90A-NEXT: buffer_load_dword v0, v1, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v2, v0, v2
+; GFX90A-NEXT: buffer_store_dword v2, v1, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB14_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB14_2
+; GFX90A-NEXT: .LBB14_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v4, vcc
+; GFX90A-NEXT: ds_add_rtn_f32 v0, v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB14_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -2943,11 +2941,11 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[3:4]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -2965,79 +2963,50 @@ define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr %ptr, floa
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, float %val seq_cst
+ %gep = getelementptr float, ptr %ptr, i64 -512
+ %result = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %result
}
-define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, float %val) #1 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+define void @flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3056,55 +3025,113 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, flo
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
-; GFX90A: ; %bb.0:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB15_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB15_8
+; GFX90A-NEXT: .LBB15_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB15_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB15_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB15_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB15_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB15_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB15_2
+; GFX90A-NEXT: .LBB15_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
-; GFX908: ; %bb.0:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB15_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB15_8
+; GFX908-NEXT: .LBB15_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB15_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB15_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB15_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB15_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB15_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB15_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB15_2
+; GFX908-NEXT: .LBB15_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
@@ -3123,11 +3150,9 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, flo
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
@@ -3145,279 +3170,284 @@ define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr %ptr, flo
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, float %val seq_cst
+ %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @flat_agent_atomic_fadd_ret_f64(ptr %ptr, double %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64:
+define void @flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64:
-; GFX90A: ; %bb.0:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB16_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB16_8
+; GFX90A-NEXT: .LBB16_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB16_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB16_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB16_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB16_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB16_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB16_2
+; GFX90A-NEXT: .LBB16_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64:
-; GFX908: ; %bb.0:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0x7fc, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB16_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB16_8
+; GFX908-NEXT: .LBB16_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB16_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB16_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB16_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB16_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB16_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB16_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB16_2
+; GFX908-NEXT: .LBB16_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: flat_load_dword v5, v[5:6]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: flat_load_dword v5, v[5:6]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v4
-; GFX7-NEXT: v_mov_b32_e32 v1, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos(ptr %ptr, double %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
@@ -3425,188 +3455,247 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
-; GFX90A: ; %bb.0:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB17_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB17_8
+; GFX90A-NEXT: .LBB17_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB17_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB17_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB17_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB17_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB17_2
+; GFX90A-NEXT: .LBB17_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
-; GFX908: ; %bb.0:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB17_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB17_8
+; GFX908-NEXT: .LBB17_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB17_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB17_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB17_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB17_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB17_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB17_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB17_2
+; GFX908-NEXT: .LBB17_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
-; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 255
- %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr %ptr, i64 -512
+ %unused = atomicrmw fadd ptr %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg(ptr %ptr, double %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+define float @flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB18_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX940-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX11-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5
-; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo
-; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1]
; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[2:3]
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB18_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX10-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v9, v1
-; GFX10-NEXT: v_mov_b32_e32 v8, v0
-; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
@@ -3614,62 +3703,68 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX908-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v9, v1
-; GFX908-NEXT: v_mov_b32_e32 v8, v0
-; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX8-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v1, v[0:1]
-; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
@@ -3677,57 +3772,53 @@ define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg(ptr %ptr, double %v
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX7-LABEL: flat_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[0:1]
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v9, v1
-; GFX7-NEXT: v_mov_b32_e32 v8, v0
-; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: v_mov_b32_e32 v1, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 -256
- %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define void @flat_agent_atomic_fadd_noret_f64(ptr %ptr, double %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64:
+define void @flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr %ptr, float %val) #1 {
+; GFX12-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3736,31 +3827,31 @@ define void @flat_agent_atomic_fadd_noret_f64(ptr %ptr, double %val) #0 {
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX940-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX11-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -3769,23 +3860,24 @@ define void @flat_agent_atomic_fadd_noret_f64(ptr %ptr, double %val) #0 {
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX10-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -3793,169 +3885,183 @@ define void @flat_agent_atomic_fadd_noret_f64(ptr %ptr, double %val) #0 {
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB19_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX908-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB19_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX8-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v6, v[0:1]
-; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64:
+; GFX7-LABEL: flat_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v6, v[0:1]
-; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos(ptr %ptr, double %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+define float @flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB20_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB20_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
@@ -3963,114 +4069,122 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos(ptr %ptr, double %v
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB20_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x7f8, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: v_mov_b32_e32 v1, v0
+; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 255
- %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
}
-define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg(ptr %ptr, double %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+define void @flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:-2048
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4079,38 +4193,31 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b64 v[6:7], v[4:5]
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -4119,25 +4226,24 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
@@ -4145,355 +4251,5148 @@ define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg(ptr %ptr, double %v
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB21_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB21_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v7, v[0:1]
-; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v7, v[0:1]
-; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr %ptr, i64 -256
- %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst
+ %gep = getelementptr float, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-; --------------------------------------------------------------------
-; half
-; --------------------------------------------------------------------
-
-define half @flat_agent_atomic_fadd_ret_f16(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16:
+define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX940-NEXT: flat_load_dword v4, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB22_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB22_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v5, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB22_6
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB22_3
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB22_3: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB22_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB22_5: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB22_6: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB22_8
+; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB22_8: ; %atomicrmw.phi
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB22_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
+}
+
+define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB23_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB23_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB23_8
+; GFX90A-NEXT: .LBB23_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB23_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB23_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB23_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB23_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB23_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB23_2
+; GFX90A-NEXT: .LBB23_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB23_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB23_8
+; GFX908-NEXT: .LBB23_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB23_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB23_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB23_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB23_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB23_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB23_2
+; GFX908-NEXT: .LBB23_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB23_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB23_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB24_6
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB24_3
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB24_3: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB24_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB24_5: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB24_6: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB24_8
+; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB24_8: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB24_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB24_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret float %result
+}
+
+define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB25_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB25_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB25_8
+; GFX90A-NEXT: .LBB25_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB25_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB25_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB25_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB25_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB25_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB25_2
+; GFX90A-NEXT: .LBB25_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB25_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB25_8
+; GFX908-NEXT: .LBB25_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB25_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB25_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB25_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB25_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB25_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB25_2
+; GFX908-NEXT: .LBB25_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB25_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB25_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+; Value-returning f32 atomicrmw fadd through a flat (generic) pointer at
+; syncscope("agent") with seq_cst ordering, annotated with all three of
+; !amdgpu.no.fine.grained.memory, !amdgpu.no.remote.memory and
+; !amdgpu.ignore.denormal.mode.
+; Expected lowering, per the check lines below: GFX12, GFX940 and GFX11 use a
+; single returning flat_atomic_add_f32; GFX90A branches on the address space
+; (global_atomic_add_f32 / private load-add-store / ds_add_rtn_f32); GFX10,
+; GFX908, GFX8 and GFX7 use a flat_atomic_cmpswap retry loop.
+define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB26_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB26_6
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB26_3
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB26_3: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB26_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB26_5: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB26_6: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB26_8
+; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB26_8: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB26_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB26_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB26_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; %result is returned to the caller, so the value-returning form of the atomic
+; is exercised. All three annotations reference the same metadata node !0.
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
+}
+
+; No-return f32 atomicrmw fadd through a flat (generic) pointer at
+; syncscope("agent") with seq_cst ordering, annotated with all three of
+; !amdgpu.no.fine.grained.memory, !amdgpu.no.remote.memory and
+; !amdgpu.ignore.denormal.mode. The function returns void and the loaded value
+; is discarded.
+; Expected lowering, per the check lines below: GFX12, GFX940 and GFX11 use a
+; single non-returning flat_atomic_add_f32; GFX90A and GFX908 branch on the
+; address space (global_atomic_add_f32 / private load-add-store / ds_add_f32);
+; GFX10, GFX8 and GFX7 use a flat_atomic_cmpswap retry loop.
+define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB27_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB27_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB27_8
+; GFX90A-NEXT: .LBB27_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB27_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB27_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB27_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB27_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB27_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB27_2
+; GFX90A-NEXT: .LBB27_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB27_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB27_8
+; GFX908-NEXT: .LBB27_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB27_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB27_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB27_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB27_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB27_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB27_2
+; GFX908-NEXT: .LBB27_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB27_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB27_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; The loaded value is deliberately unused so that the no-return form of the
+; atomic is exercised. All three annotations reference the same metadata node !0.
+ %unused = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret void
+}
+
+; Value-returning f32 atomicrmw fadd through a flat (generic) pointer at
+; syncscope("agent") with seq_cst ordering, annotated with
+; !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory only (no
+; !amdgpu.ignore.denormal.mode).
+; Expected lowering, per the check lines below: GFX12, GFX940 and GFX11 use a
+; single returning flat_atomic_add_f32; GFX90A branches on the address space
+; (global_atomic_add_f32 / private load-add-store / ds_add_rtn_f32); GFX10,
+; GFX908, GFX8 and GFX7 use a flat_atomic_cmpswap retry loop.
+define float @flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB28_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB28_6
+; GFX90A-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: ; implicit-def: $vgpr3
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB28_3
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v3, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB28_3: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB28_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v3, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v3, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB28_5: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB28_6: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB28_8
+; GFX90A-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_rtn_f32 v3, v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB28_8: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB28_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB28_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB28_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+; %result is returned to the caller, so the value-returning form of the atomic
+; is exercised. Both annotations reference the same metadata node !0.
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret float %result
+}
+
+define void @flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory(ptr %ptr, float %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_add_f32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB29_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0: ; %atomicrmw.check.shared
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB29_3
+; GFX90A-NEXT: ; %bb.1: ; %Flow2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB29_8
+; GFX90A-NEXT: .LBB29_2: ; %atomicrmw.phi
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+; GFX90A-NEXT: .LBB29_3: ; %atomicrmw.check.private
+; GFX90A-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX90A-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX90A-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB29_5
+; GFX90A-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: .LBB29_5: ; %Flow
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX90A-NEXT: s_cbranch_execz .LBB29_7
+; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX90A-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX90A-NEXT: .LBB29_7: ; %Flow1
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX90A-NEXT: ; implicit-def: $vgpr2
+; GFX90A-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX90A-NEXT: s_cbranch_execz .LBB29_2
+; GFX90A-NEXT: .LBB29_8: ; %atomicrmw.shared
+; GFX90A-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX90A-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX90A-NEXT: ds_add_f32 v0, v2
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX908: ; %bb.0: ; %atomicrmw.check.shared
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: s_mov_b64 s[4:5], src_shared_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s5, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB29_3
+; GFX908-NEXT: ; %bb.1: ; %Flow2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB29_8
+; GFX908-NEXT: .LBB29_2: ; %atomicrmw.phi
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+; GFX908-NEXT: .LBB29_3: ; %atomicrmw.check.private
+; GFX908-NEXT: s_mov_b64 s[6:7], src_private_base
+; GFX908-NEXT: v_cmp_ne_u32_e32 vcc, s7, v1
+; GFX908-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB29_5
+; GFX908-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: .LBB29_5: ; %Flow
+; GFX908-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX908-NEXT: s_cbranch_execz .LBB29_7
+; GFX908-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f32_e32 v1, v1, v2
+; GFX908-NEXT: buffer_store_dword v1, v0, s[0:3], 0 offen
+; GFX908-NEXT: .LBB29_7: ; %Flow1
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX908-NEXT: ; implicit-def: $vgpr2
+; GFX908-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX908-NEXT: s_cbranch_execz .LBB29_2
+; GFX908-NEXT: .LBB29_8: ; %atomicrmw.shared
+; GFX908-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX908-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX908-NEXT: ds_add_f32 v0, v2
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB29_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory_amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB29_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB30_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB30_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[5:6]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB30_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: flat_load_dword v5, v[5:6]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB30_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v1, v5
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
+}
+
+define double @flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b64 v[4:5], v[0:1] offset:2040
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB31_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] offset:2040 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:2040
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB31_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB31_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7f8, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB31_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr double, ptr %ptr, i64 255
+ %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
+}
+
+define double @flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1] offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v5
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v4, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v5, vcc_lo, 0xfffff800, v5
+; GFX11-NEXT: v_add_co_ci_u32_e32 v6, vcc_lo, -1, v4, vcc_lo
+; GFX11-NEXT: flat_load_b64 v[0:1], v[0:1]
+; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[7:8], v[9:10], v[2:3]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[5:6], v[7:10] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v9, v1
+; GFX10-NEXT: v_mov_b32_e32 v8, v0
+; GFX10-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[8:9]
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB32_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v9, v1
+; GFX908-NEXT: v_mov_b32_e32 v8, v0
+; GFX908-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB32_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v1, v[0:1]
+; GFX8-NEXT: flat_load_dword v0, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB32_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v1, v[0:1]
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v9, v1
+; GFX7-NEXT: v_mov_b32_e32 v8, v0
+; GFX7-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB32_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr double, ptr %ptr, i64 -256
+ %result = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
+}
+
+; Agent-scope seq_cst atomicrmw fadd of a double through a flat pointer; the
+; result is unused and the operation carries !amdgpu.no.fine.grained.memory.
+; Per the checks below, GFX90A and GFX940 select flat_atomic_add_f64 directly,
+; while every other target expands to a flat_atomic_cmpswap retry loop (GFX7
+; and GFX8 fetch the initial value with two separate dword loads).
+define void @flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1]
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB33_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB33_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v6, v[0:1]
+; GFX8-NEXT: flat_load_dword v7, v[4:5]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB33_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v6, v[0:1]
+; GFX7-NEXT: flat_load_dword v7, v[4:5]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB33_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Agent-scope seq_cst atomicrmw fadd of a double with the result unused, at a
+; positive offset: the GEP of 255 doubles is a byte offset of 2040 (0x7f8).
+; Per the checks below, GFX12, GFX940, GFX11, GFX90A and GFX908 fold the offset
+; into the memory instruction as offset 2040, while GFX10, GFX8 and GFX7 add
+; 0x7f8 to the pointer with VALU adds first. GFX90A and GFX940 select
+; flat_atomic_add_f64; the remaining targets use a flat_atomic_cmpswap loop.
+define void @flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB34_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b64 v[6:7], v[0:1] offset:2040
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7f8, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB34_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:2040
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:2040
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:2040 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB34_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7f8, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB34_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x7f8, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB34_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr double, ptr %ptr, i64 255
+ %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Agent-scope seq_cst atomicrmw fadd of a double with the result unused, at a
+; negative offset: the GEP of -256 doubles is a byte offset of -2048.
+; Per the checks below, only GFX12 folds the offset into the memory
+; instruction (offset -2048); every other target adds 0xfffff800 to the low
+; half of the pointer and -1 plus carry to the high half. GFX90A and GFX940
+; select flat_atomic_add_f64; the remaining targets use a flat_atomic_cmpswap
+; loop.
+; NOTE(review): the GFX11 and GFX908 checks materialize ptr + 0xfffff800 twice
+; into two register pairs (one feeds the initial load, the other the cmpswap).
+; This looks like redundant address computation in codegen - confirm whether
+; it is a known gap before relying on these checks as the desired output.
+define void @flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, double %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1] offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB35_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_b64 v[6:7], v[4:5]
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB35_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v8, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v1
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB35_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff804, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB35_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, -1, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff804, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB35_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr double, ptr %ptr, i64 -256
+ %unused = atomicrmw fadd ptr %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; --------------------------------------------------------------------
+; half
+; --------------------------------------------------------------------
+
+define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB36_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: flat_load_dword v4, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB36_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB36_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB36_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB36_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB36_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
+}
+
+define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { ; value-returning f16 atomic fadd through a flat pointer at a positive element offset
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB37_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: flat_load_dword v4, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB37_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB37_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB37_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB37_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB37_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr %ptr, i64 1023 ; 1023 half elements = 2046 bytes, the 0x7fe added to the address in the assertions above
+ %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ; each target asserted above expands this into a cmpswap loop on the enclosing 4-byte-aligned dword
+ ret half %result ; atomicrmw yields the value that was in memory before the add
+}
+
+define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 { ; value-returning f16 atomic fadd through a flat pointer at a negative element offset
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB38_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: flat_load_dword v4, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB38_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB38_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB38_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB38_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v6, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB38_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr %ptr, i64 -1024 ; -1024 half elements = -2048 bytes, the sign-extended 0xfffff800 offset added to the address in the assertions above
+ %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ; each target asserted above expands this into a cmpswap loop on the enclosing 4-byte-aligned dword
+ ret half %result ; atomicrmw yields the value that was in memory before the add
+ }
+
+; Agent-scope seq_cst atomicrmw fadd of a half through a flat pointer, with the
+; result unused and !amdgpu.no.fine.grained.memory attached. Every check block
+; below expects a 32-bit compare-and-swap loop: the address is masked with -4,
+; the dword at that address is loaded, the half is selected with a shift of
+; (addr & 3) * 8 and a shifted 0xffff mask, and flat_atomic_cmpswap retries
+; until the returned dword equals the expected one. The GFX7 checks perform the
+; add in f32 (v_cvt_f32_f16 / v_add_f32 / v_cvt_f16_f32) instead of v_add_f16.
+define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB39_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB39_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB39_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB39_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB39_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB39_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB39_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Agent-scope seq_cst atomicrmw fadd of a half at %ptr + 1023 half elements
+; (byte offset 0x7fe), with the result unused and
+; !amdgpu.no.fine.grained.memory attached. Every check block below expects the
+; 0x7fe offset to be added to the 64-bit address with explicit add instructions
+; before the address is masked with -4, followed by a 32-bit compare-and-swap
+; loop (shift of (addr & 3) * 8, shifted 0xffff mask, flat_atomic_cmpswap).
+; The GFX7 checks perform the add in f32 via conversions instead of v_add_f16.
+define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB40_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB40_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB40_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB40_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB40_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB40_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v2
+; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB40_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ ; 1023 half elements * 2 bytes = 2046 = 0x7fe, the byte offset added in the checks above.
+ %gep = getelementptr half, ptr %ptr, i64 1023
+ %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB41_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB41_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB41_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB41_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB41_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB41_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v2
+; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB41_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr %ptr, i64 -1024
+ %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
+; GFX940-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB42_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB42_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB42_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB42_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB42_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr %ptr, i64 1023
+ %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
+; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB43_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_add_f16_e32 v0, v1, v2
+; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB43_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB43_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_e32 v0, v1, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB43_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB43_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr %ptr, i64 1023
+ %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret half %result
+}
+
+define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: flat_load_dword v4, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB44_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB44_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB44_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB44_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
@@ -4503,7 +9402,7 @@ define half @flat_agent_atomic_fadd_ret_f16(ptr %ptr, half %val) #0 {
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
-; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
@@ -4520,18 +9419,19 @@ define half @flat_agent_atomic_fadd_ret_f16(ptr %ptr, half %val) #0 {
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB22_1
+; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst
+ %gep = getelementptr half, ptr %ptr, i64 1023
+ %result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret half %result
}
-define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, half %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -4544,73 +9444,70 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: flat_load_b32 v4, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB23_1
+; GFX12-NEXT: s_cbranch_execnz .LBB45_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: flat_load_dword v4, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB23_1
+; GFX940-NEXT: s_cbranch_execnz .LBB45_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4619,39 +9516,37 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB23_1
+; GFX11-NEXT: s_cbranch_execnz .LBB45_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX10-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
@@ -4659,66 +9554,66 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v5, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB23_1
+; GFX10-NEXT: s_cbranch_execnz .LBB45_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB45_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX908-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
@@ -4726,119 +9621,118 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB23_1
+; GFX908-NEXT: s_cbranch_execnz .LBB45_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX8-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB23_1
+; GFX8-NEXT: s_cbranch_execnz .LBB45_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX7-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
-; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v2
+; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB23_1
+; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst
- ret half %result
+ %unused = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; --------------------------------------------------------------------
+; bfloat
+; --------------------------------------------------------------------
+
+define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
@@ -4846,15 +9740,23 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
@@ -4866,55 +9768,60 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
+; GFX12-NEXT: s_cbranch_execnz .LBB46_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: flat_load_dword v4, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: flat_load_dword v5, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB24_1
+; GFX940-NEXT: s_cbranch_execnz .LBB46_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
@@ -4922,15 +9829,24 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
@@ -4943,17 +9859,17 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
+; GFX11-NEXT: s_cbranch_execnz .LBB46_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -4961,13 +9877,18 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
@@ -4977,83 +9898,94 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB24_1
+; GFX10-NEXT: s_cbranch_execnz .LBB46_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v5, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB24_1
+; GFX908-NEXT: s_cbranch_execnz .LBB46_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
@@ -5062,14 +9994,21 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5077,36 +10016,35 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB24_1
+; GFX8-NEXT: s_cbranch_execnz .LBB46_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
-; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -5114,340 +10052,427 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB24_1
+; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 -1024
- %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst
- ret half %result
- }
+ %result = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
-define void @flat_agent_atomic_fadd_noret_f16(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16:
+define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
+; GFX12-NEXT: s_cbranch_execnz .LBB47_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB25_1
+; GFX940-NEXT: s_cbranch_execnz .LBB47_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
+; GFX11-NEXT: s_cbranch_execnz .LBB47_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
-; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB25_1
+; GFX10-NEXT: s_cbranch_execnz .LBB47_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v5, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB25_1
+; GFX908-NEXT: s_cbranch_execnz .LBB47_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB25_1
+; GFX8-NEXT: s_cbranch_execnz .LBB47_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB25_1
+; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
}
-define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: flat_load_b32 v5, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
+; GFX12-NEXT: s_cbranch_execnz .LBB48_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
@@ -5456,506 +10481,615 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_not_b32_e32 v4, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB26_1
+; GFX940-NEXT: s_cbranch_execnz .LBB48_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v5, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
+; GFX11-NEXT: s_cbranch_execnz .LBB48_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
-; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v5, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB26_1
+; GFX10-NEXT: s_cbranch_execnz .LBB48_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
+; GFX908-NEXT: flat_load_dword v5, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB26_1
+; GFX908-NEXT: s_cbranch_execnz .LBB48_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB26_1
+; GFX8-NEXT: s_cbranch_execnz .LBB48_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
-; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB26_1
+; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 1023
- %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst
- ret void
-}
+ %gep = getelementptr bfloat, ptr %ptr, i64 -1024
+ %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+ }
-define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX12-NEXT: v_not_b32_e32 v5, v5
+; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
+; GFX12-NEXT: s_cbranch_execnz .LBB49_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB27_1
+; GFX940-NEXT: s_cbranch_execnz .LBB49_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT: v_not_b32_e32 v5, v5
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
+; GFX11-NEXT: s_cbranch_execnz .LBB49_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
-; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB27_1
+; GFX10-NEXT: s_cbranch_execnz .LBB49_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB27_1
+; GFX908-NEXT: s_cbranch_execnz .LBB49_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB27_1
+; GFX8-NEXT: s_cbranch_execnz .LBB49_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
-; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -5965,302 +11099,457 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB27_1
+; GFX7-NEXT: s_cbranch_execnz .LBB49_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 -1024
- %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v5, v5
+; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
+; GFX12-NEXT: s_cbranch_execnz .LBB50_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB28_1
+; GFX940-NEXT: s_cbranch_execnz .LBB50_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v5, v5
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
+; GFX11-NEXT: s_cbranch_execnz .LBB50_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_add_f16_e32 v0, v1, v2
-; GFX10-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX10-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB28_1
+; GFX10-NEXT: s_cbranch_execnz .LBB50_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB28_1
+; GFX908-NEXT: s_cbranch_execnz .LBB50_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB28_1
+; GFX8-NEXT: s_cbranch_execnz .LBB50_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB28_1
+; GFX7-NEXT: s_cbranch_execnz .LBB50_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4
- ret half %result
+ %gep = getelementptr bfloat, ptr %ptr, i64 -1024
+ %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
+; GFX12-NEXT: s_cbranch_execnz .LBB51_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
+; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB29_1
+; GFX940-NEXT: s_cbranch_execnz .LBB51_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2046
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
@@ -6268,457 +11557,468 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
+; GFX11-NEXT: s_cbranch_execnz .LBB51_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB29_1
+; GFX10-NEXT: s_cbranch_execnz .LBB51_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2046
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2046
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB29_1
+; GFX908-NEXT: s_cbranch_execnz .LBB51_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
+; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB29_1
+; GFX8-NEXT: s_cbranch_execnz .LBB51_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB29_1
+; GFX7-NEXT: s_cbranch_execnz .LBB51_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 1023
- %unused = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4
- ret void
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
}
-define half @flat_system_atomic_fadd_ret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB30_1
+; GFX12-NEXT: s_cbranch_execnz .LBB52_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: flat_load_dword v4, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
+; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] sc0 sc1
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB30_1
+; GFX940-NEXT: s_cbranch_execnz .LBB52_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB30_1
+; GFX11-NEXT: s_cbranch_execnz .LBB52_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v5, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB30_1
+; GFX10-NEXT: s_cbranch_execnz .LBB52_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v4, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: flat_atomic_cmpswap v4, v[0:1], v[6:7] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB30_1
+; GFX908-NEXT: s_cbranch_execnz .LBB52_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB30_1
+; GFX8-NEXT: s_cbranch_execnz .LBB52_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
-; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB30_1
+; GFX7-NEXT: s_cbranch_execnz .LBB52_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, half %val seq_cst
- ret half %result
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
; GFX12-NEXT: flat_load_b32 v4, v[0:1]
@@ -6726,71 +12026,85 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB31_1
+; GFX12-NEXT: s_cbranch_execnz .LBB53_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
; GFX940-NEXT: v_not_b32_e32 v6, v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v5, v4
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB31_1
+; GFX940-NEXT: s_cbranch_execnz .LBB53_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
@@ -6798,13 +12112,22 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
@@ -6818,16 +12141,16 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB31_1
+; GFX11-NEXT: s_cbranch_execnz .LBB53_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6835,12 +12158,17 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
; GFX10-NEXT: v_not_b32_e32 v6, v3
-; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
@@ -6851,16 +12179,15 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB31_1
+; GFX10-NEXT: s_cbranch_execnz .LBB53_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6869,32 +12196,36 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
; GFX90A-NEXT: v_not_b32_e32 v6, v4
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6903,12 +12234,19 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6917,16 +12255,15 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB31_1
+; GFX908-NEXT: s_cbranch_execnz .LBB53_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
@@ -6935,13 +12272,20 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -6950,74 +12294,70 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB31_1
+; GFX8-NEXT: s_cbranch_execnz .LBB53_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
-; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
+; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB31_1
+; GFX7-NEXT: s_cbranch_execnz .LBB53_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr %ptr, i64 1023
- %unused = atomicrmw fadd ptr %gep, half %val seq_cst
+ %unused = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; bfloat
-; --------------------------------------------------------------------
-
-define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16:
+define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: flat_load_b32 v5, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v6, v5
@@ -7040,24 +12380,26 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
+; GFX12-NEXT: s_cbranch_execnz .LBB54_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX940-NEXT: s_mov_b32 s0, 0xffff
; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
@@ -7065,7 +12407,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v7, v5
@@ -7080,34 +12422,36 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB32_1
+; GFX940-NEXT: s_cbranch_execnz .LBB54_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: flat_load_b32 v5, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v5
@@ -7136,25 +12480,26 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
+; GFX11-NEXT: s_cbranch_execnz .LBB54_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX10-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: flat_load_dword v5, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v5
@@ -7175,16 +12520,17 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB32_1
+; GFX10-NEXT: s_cbranch_execnz .LBB54_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7195,7 +12541,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v5
@@ -7208,22 +12554,25 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX908-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: flat_load_dword v5, v[0:1]
; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7234,7 +12583,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v5
@@ -7253,16 +12602,17 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB32_1
+; GFX908-NEXT: s_cbranch_execnz .LBB54_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX8-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v5, v[0:1]
; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7272,7 +12622,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v5
@@ -7293,16 +12643,17 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB32_1
+; GFX8-NEXT: s_cbranch_execnz .LBB54_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16:
+; GFX7-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
@@ -7312,7 +12663,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX7-NEXT: v_not_b32_e32 v4, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
@@ -7329,2994 +12680,1893 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB32_1
+; GFX7-NEXT: s_cbranch_execnz .LBB54_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst
+ %gep = getelementptr bfloat, ptr %ptr, i64 1023
+ %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret bfloat %result
}
-define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, bfloat %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: flat_load_b32 v3, v[0:1]
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT: v_not_b32_e32 v5, v5
+; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
+; GFX12-NEXT: s_cbranch_execnz .LBB55_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: flat_load_dword v3, v[0:1]
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB33_1
+; GFX940-NEXT: s_cbranch_execnz .LBB55_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: v_not_b32_e32 v5, v5
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
+; GFX11-NEXT: s_cbranch_execnz .LBB55_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v5, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB33_1
+; GFX10-NEXT: s_cbranch_execnz .LBB55_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX908-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v5, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB33_1
+; GFX908-NEXT: s_cbranch_execnz .LBB55_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX8-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB33_1
+; GFX8-NEXT: s_cbranch_execnz .LBB55_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX7-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_not_b32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB33_1
+; GFX7-NEXT: s_cbranch_execnz .LBB55_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
+ %unused = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; --------------------------------------------------------------------
+; <2 x half>
+; --------------------------------------------------------------------
+
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB34_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
+; GFX11-NEXT: s_cbranch_execnz .LBB56_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v5, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB34_1
+; GFX10-NEXT: s_cbranch_execnz .LBB56_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v5, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB34_1
+; GFX908-NEXT: s_cbranch_execnz .LBB56_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB34_1
+; GFX8-NEXT: s_cbranch_execnz .LBB56_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB34_1
+; GFX7-NEXT: s_cbranch_execnz .LBB56_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 -1024
- %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
- }
+ %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
-define void @flat_agent_atomic_fadd_noret_bf16(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16:
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: flat_load_b32 v4, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB35_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: flat_load_b32 v4, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
+; GFX11-NEXT: s_cbranch_execnz .LBB57_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
-; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB35_1
+; GFX10-NEXT: s_cbranch_execnz .LBB57_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: flat_atomic_cmpswap v4, v[0:1], v[4:5] glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v4, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB35_1
+; GFX908-NEXT: s_cbranch_execnz .LBB57_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB35_1
+; GFX8-NEXT: s_cbranch_execnz .LBB57_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v1, v[4:5]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB35_1
+; GFX7-NEXT: s_cbranch_execnz .LBB57_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
}
-define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB36_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v0, v1, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
+; GFX11-NEXT: s_cbranch_execnz .LBB58_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v5, v5
-; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB36_1
+; GFX10-NEXT: s_cbranch_execnz .LBB58_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX908-NEXT: flat_load_dword v3, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB36_1
+; GFX908-NEXT: s_cbranch_execnz .LBB58_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB36_1
+; GFX8-NEXT: s_cbranch_execnz .LBB58_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v1, v[4:5]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB36_1
+; GFX7-NEXT: s_cbranch_execnz .LBB58_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+ %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
}
-
-define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+
+define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB37_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
+; GFX11-NEXT: s_cbranch_execnz .LBB59_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v5, v5
-; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB37_1
+; GFX10-NEXT: s_cbranch_execnz .LBB59_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB59_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX908-NEXT: flat_load_dword v3, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB37_1
+; GFX908-NEXT: s_cbranch_execnz .LBB59_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB37_1
+; GFX8-NEXT: s_cbranch_execnz .LBB59_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB37_1
+; GFX7-NEXT: s_cbranch_execnz .LBB59_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 -1024
- %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB38_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB38_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2046 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB38_1
+; GFX11-NEXT: s_cbranch_execnz .LBB60_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
-; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT: v_and_or_b32 v5, 0xffff0000, v6, v0
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB38_1
+; GFX10-NEXT: s_cbranch_execnz .LBB60_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2046 glc
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2046 glc
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB38_1
+; GFX908-NEXT: s_cbranch_execnz .LBB60_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
-; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB38_1
+; GFX8-NEXT: s_cbranch_execnz .LBB60_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB38_1
+; GFX7-NEXT: s_cbranch_execnz .LBB60_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
- ret bfloat %result
+ %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB39_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:2046
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB39_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2046 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB39_1
+; GFX11-NEXT: s_cbranch_execnz .LBB61_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB39_1
+; GFX10-NEXT: s_cbranch_execnz .LBB61_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
+; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB39_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2046
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
-; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2046 glc
+; GFX908-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB39_1
+; GFX908-NEXT: s_cbranch_execnz .LBB61_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB39_1
+; GFX8-NEXT: s_cbranch_execnz .LBB61_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
-; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB39_1
+; GFX7-NEXT: s_cbranch_execnz .LBB61_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %unused = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
+ %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+ %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v5, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB40_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v5, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB40_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v5, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v5, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB40_1
+; GFX11-NEXT: s_cbranch_execnz .LBB62_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v5, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
-; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB40_1
+; GFX10-NEXT: s_cbranch_execnz .LBB62_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: flat_load_dword v5, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v5, v[0:1], v[6:7] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB40_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB62_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX908-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: flat_load_dword v5, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB40_1
+; GFX908-NEXT: s_cbranch_execnz .LBB62_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX8-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB40_1
+; GFX8-NEXT: s_cbranch_execnz .LBB62_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v1, v[4:5]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB40_1
+; GFX7-NEXT: s_cbranch_execnz .LBB62_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %result = atomicrmw fadd ptr %gep, bfloat %val seq_cst
- ret bfloat %result
+ %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
}
-define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos(ptr %ptr, bfloat %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: flat_load_b32 v3, v[0:1]
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB41_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB41_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB41_1
+; GFX11-NEXT: s_cbranch_execnz .LBB63_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v5, v5
-; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
+; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB41_1
+; GFX10-NEXT: s_cbranch_execnz .LBB63_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB41_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB63_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX908-NEXT: flat_load_dword v3, v[0:1]
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB41_1
+; GFX908-NEXT: s_cbranch_execnz .LBB63_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB41_1
+; GFX8-NEXT: s_cbranch_execnz .LBB63_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
-; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v3, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB41_1
+; GFX7-NEXT: s_cbranch_execnz .LBB63_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr %ptr, i64 1023
- %unused = atomicrmw fadd ptr %gep, bfloat %val seq_cst
+ %gep = getelementptr <2 x half>, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; <2 x half>
-; --------------------------------------------------------------------
-
-define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -10329,7 +14579,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -10338,12 +14588,12 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v4, v3
@@ -10358,18 +14608,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB42_1
+; GFX11-NEXT: s_cbranch_execnz .LBB64_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v4, v3
@@ -10382,18 +14632,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB42_1
+; GFX10-NEXT: s_cbranch_execnz .LBB64_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
@@ -10404,18 +14654,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB64_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
@@ -10426,18 +14676,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB42_1
+; GFX908-NEXT: s_cbranch_execnz .LBB64_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v4, v3
@@ -10450,13 +14700,13 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB42_1
+; GFX8-NEXT: s_cbranch_execnz .LBB64_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v5, v[0:1]
@@ -10469,7 +14719,7 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
-; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -10492,18 +14742,18 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) #
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB42_1
+; GFX7-NEXT: s_cbranch_execnz .LBB64_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v0, v2
; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x half> %result
}
-define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -10511,189 +14761,178 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0
+; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB43_1
+; GFX11-NEXT: s_cbranch_execnz .LBB65_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB43_1
+; GFX10-NEXT: s_cbranch_execnz .LBB65_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB43_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB65_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB43_1
+; GFX908-NEXT: s_cbranch_execnz .LBB65_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB43_1
+; GFX8-NEXT: s_cbranch_execnz .LBB65_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[4:5]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
-; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB43_1
+; GFX7-NEXT: s_cbranch_execnz .LBB65_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
- ret <2 x half> %result
+ %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret void
}
-define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -10701,203 +14940,186 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr %ptr, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB44_1
+; GFX11-NEXT: s_cbranch_execnz .LBB66_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB44_1
+; GFX10-NEXT: s_cbranch_execnz .LBB66_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB66_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dword v0, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: v_pk_add_f16 v0, v1, v2
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB44_1
+; GFX908-NEXT: s_cbranch_execnz .LBB66_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB44_1
+; GFX8-NEXT: s_cbranch_execnz .LBB66_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[4:5]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
-; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB44_1
+; GFX7-NEXT: s_cbranch_execnz .LBB66_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
- %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x half> %result
}
-define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+define void @flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -10910,7 +15132,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -10919,12 +15141,12 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
@@ -10938,17 +15160,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB45_1
+; GFX11-NEXT: s_cbranch_execnz .LBB67_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: flat_load_dword v4, v[0:1]
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
@@ -10961,17 +15183,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB45_1
+; GFX10-NEXT: s_cbranch_execnz .LBB67_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_load_dword v5, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
@@ -10982,17 +15204,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB67_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: flat_load_dword v4, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
@@ -11003,17 +15225,17 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB45_1
+; GFX908-NEXT: s_cbranch_execnz .LBB67_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
@@ -11026,12 +15248,12 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB45_1
+; GFX8-NEXT: s_cbranch_execnz .LBB67_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: flat_load_dword v5, v[0:1]
@@ -11044,7 +15266,7 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
@@ -11067,16 +15289,20 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) #0 {
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB45_1
+; GFX7-NEXT: s_cbranch_execnz .LBB67_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret void
}
-define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; --------------------------------------------------------------------
+; <2 x bfloat>
+; --------------------------------------------------------------------
+
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -11084,185 +15310,274 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB46_1
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB68_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB46_1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB68_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB68_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB46_1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB68_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB46_1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB68_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB46_1
+; GFX7-NEXT: s_cbranch_execnz .LBB68_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
}
-define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -11270,200 +15585,277 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr %ptr, <2 x ha
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB47_1
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB69_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: flat_load_dword v0, v[3:4]
+; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB47_1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB69_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dword v1, v[0:1]
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v0, v1, v2
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB69_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dword v1, v[0:1]
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v0, v1, v2
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
-; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB47_1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB69_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB47_1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB69_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB47_1
+; GFX7-NEXT: s_cbranch_execnz .LBB69_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
- %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
}
-define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -11471,191 +15863,291 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr %ptr, <2
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:2044 sc0 sc1
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB48_1
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB70_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v1, v0
-; GFX10-NEXT: v_pk_add_f16 v0, v1, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB48_1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB70_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB70_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dword v0, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v0
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB48_1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB70_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB48_1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB70_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v1, v[4:5]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[4:5]
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
-; GFX7-NEXT: flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v7
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB48_1
+; GFX7-NEXT: s_cbranch_execnz .LBB70_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst
- ret <2 x half> %result
+ %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+ %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
}
-define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -11663,191 +16155,266 @@ define void @flat_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr %ptr, <2 x h
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:2044 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB49_1
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB71_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: flat_load_dword v4, v[0:1]
-; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: s_cbranch_execnz .LBB49_1
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB71_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
-; GFX90A: ; %bb.0:
-; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v5, v[0:1] offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB71_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v4, v[0:1] offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB49_1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB71_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB49_1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB71_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB49_1
+; GFX7-NEXT: s_cbranch_execnz .LBB71_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, <2 x half> %val seq_cst
+ %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; <2 x bfloat>
-; --------------------------------------------------------------------
-
-define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -11855,274 +16422,273 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-NEXT: s_cbranch_execnz .LBB72_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
-; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB50_1
+; GFX10-NEXT: s_cbranch_execnz .LBB72_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB72_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB50_1
+; GFX908-NEXT: s_cbranch_execnz .LBB72_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB50_1
+; GFX8-NEXT: s_cbranch_execnz .LBB72_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
-; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB50_1
+; GFX7-NEXT: s_cbranch_execnz .LBB72_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
- ret <2 x bfloat> %result
+ %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -12130,277 +16696,288 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-NEXT: s_cbranch_execnz .LBB73_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB51_1
+; GFX10-NEXT: s_cbranch_execnz .LBB73_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB73_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9
+; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB51_1
+; GFX908-NEXT: s_cbranch_execnz .LBB73_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB51_1
+; GFX8-NEXT: s_cbranch_execnz .LBB73_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB51_1
+; GFX7-NEXT: s_cbranch_execnz .LBB73_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
- ret <2 x bfloat> %result
+ %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+ %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -12408,87 +16985,79 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s1, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v0
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v0
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v0, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-NEXT: s_cbranch_execnz .LBB74_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
@@ -12515,107 +17084,103 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB52_1
+; GFX10-NEXT: s_cbranch_execnz .LBB74_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dword v0, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v3, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v3, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB74_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dword v0, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v0
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v0, s9
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB52_1
+; GFX908-NEXT: s_cbranch_execnz .LBB74_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX8-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v6, v0
@@ -12643,16 +17208,16 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB52_1
+; GFX8-NEXT: s_cbranch_execnz .LBB74_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v0, v[4:5]
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -12662,7 +17227,7 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -12682,17 +17247,17 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr %ptr,
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB52_1
+; GFX7-NEXT: s_cbranch_execnz .LBB74_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
- %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+ %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %result
}
-define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -12700,30 +17265,30 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -12746,7 +17311,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -12755,20 +17320,22 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-NEXT: s_cbranch_execnz .LBB75_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -12795,21 +17362,21 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB53_1
+; GFX10-NEXT: s_cbranch_execnz .LBB75_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -12827,28 +17394,30 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB75_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -12866,26 +17435,28 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB53_1
+; GFX908-NEXT: s_cbranch_execnz .LBB75_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -12913,14 +17484,16 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB53_1
+; GFX8-NEXT: s_cbranch_execnz .LBB75_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -12930,7 +17503,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -12950,16 +17523,17 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) #0
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB53_1
+; GFX7-NEXT: s_cbranch_execnz .LBB75_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+ %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
+ %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -12967,273 +17541,274 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
-; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-NEXT: s_cbranch_execnz .LBB76_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB54_1
+; GFX10-NEXT: s_cbranch_execnz .LBB76_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB76_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB54_1
+; GFX908-NEXT: s_cbranch_execnz .LBB76_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB54_1
+; GFX8-NEXT: s_cbranch_execnz .LBB76_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB54_1
+; GFX7-NEXT: s_cbranch_execnz .LBB76_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret <2 x bfloat> %result
}
-define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -13241,37 +17816,30 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v3, v[3:4]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13303,22 +17871,20 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-NEXT: s_cbranch_execnz .LBB77_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13345,107 +17911,97 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB55_1
+; GFX10-NEXT: s_cbranch_execnz .LBB77_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: flat_load_dword v1, v[0:1]
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX90A-NEXT: v_add_f32_e32 v0, v0, v3
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX90A-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX90A-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v0, v6, v0, s9
-; GFX90A-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB77_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: flat_load_dword v1, v[0:1]
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v1
-; GFX908-NEXT: v_add_f32_e32 v0, v0, v5
-; GFX908-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX908-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX908-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v0, v6, v0, s9
-; GFX908-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB55_1
+; GFX908-NEXT: s_cbranch_execnz .LBB77_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13473,16 +18029,14 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB55_1
+; GFX8-NEXT: s_cbranch_execnz .LBB77_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -13492,7 +18046,7 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -13512,17 +18066,16 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr %ptr, <2 x b
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB55_1
+; GFX7-NEXT: s_cbranch_execnz .LBB77_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
- %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -13530,30 +18083,30 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:2044 sc0 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v3
@@ -13578,7 +18131,7 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -13586,64 +18139,63 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-NEXT: s_cbranch_execnz .LBB78_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v0, v[3:4]
-; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v0, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v5, v5, v0, 0x7060302
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB56_1
+; GFX10-NEXT: s_cbranch_execnz .LBB78_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
@@ -13662,30 +18214,28 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB78_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v3
@@ -13704,105 +18254,103 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr %ptr,
; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] offset:2044 glc
+; GFX908-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB56_1
+; GFX908-NEXT: s_cbranch_execnz .LBB78_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB56_1
+; GFX8-NEXT: s_cbranch_execnz .LBB78_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v0, v[4:5]
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
-; GFX7-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB56_1
+; GFX7-NEXT: s_cbranch_execnz .LBB78_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
- %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst
+ %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %result
}
-define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+define void @flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -13810,30 +18358,30 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044
+; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:2044 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:2044
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13856,7 +18404,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:2044 glc
+; GFX11-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -13865,22 +18413,20 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-NEXT: s_cbranch_execnz .LBB79_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x7fc, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: flat_load_dword v3, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: flat_load_dword v3, v[0:1]
-; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13907,21 +18453,21 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB57_1
+; GFX10-NEXT: s_cbranch_execnz .LBB79_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX90A-NEXT: flat_load_dword v3, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13939,30 +18485,28 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
+; GFX90A-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB79_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: flat_load_dword v3, v[0:1] offset:2044
+; GFX908-NEXT: flat_load_dword v3, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -13980,28 +18524,26 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:2044 glc
+; GFX908-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB57_1
+; GFX908-NEXT: s_cbranch_execnz .LBB79_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14029,16 +18571,14 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB57_1
+; GFX8-NEXT: s_cbranch_execnz .LBB79_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fc, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v5, v[0:1]
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -14048,7 +18588,7 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -14068,14 +18608,15 @@ define void @flat_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr %ptr, <2 x
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB57_1
+; GFX7-NEXT: s_cbranch_execnz .LBB79_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 511
- %unused = atomicrmw fadd ptr %gep, <2 x bfloat> %val seq_cst
+ %unused = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret void
}
attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 77c8e034e68a0..62762008b0c23 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -13,8 +13,8 @@
; float
; --------------------------------------------------------------------
-define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f32:
+define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -27,7 +27,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -36,7 +36,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -46,7 +46,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
@@ -70,7 +70,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
@@ -78,7 +78,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
@@ -100,7 +100,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
@@ -122,7 +122,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -150,7 +150,7 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f32:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -179,12 +179,12 @@ define float @global_agent_atomic_fadd_ret_f32(ptr addrspace(1) %ptr, float %val
; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %result
}
-define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -197,7 +197,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -206,7 +206,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -216,7 +216,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -240,7 +240,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
@@ -248,7 +248,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -270,7 +270,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -293,7 +293,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -321,7 +321,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -351,12 +351,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %result
}
-define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -369,7 +369,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -378,7 +378,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -388,7 +388,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
@@ -412,7 +412,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc
@@ -420,7 +420,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
@@ -442,7 +442,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
@@ -465,7 +465,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
@@ -498,7 +498,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
@@ -533,12 +533,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg(ptr addrspace(1) %
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
- %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %result
}
-define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f32:
+define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -551,7 +551,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -560,7 +560,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -570,7 +570,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off
@@ -593,7 +593,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
@@ -601,7 +601,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
@@ -609,7 +609,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v4, v[0:1]
@@ -630,7 +630,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -657,7 +657,7 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f32:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -685,12 +685,12 @@ define void @global_agent_atomic_fadd_noret_f32(ptr addrspace(1) %ptr, float %va
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -703,7 +703,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -712,7 +712,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -722,7 +722,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
@@ -745,7 +745,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
@@ -753,7 +753,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
@@ -761,7 +761,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -784,7 +784,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -811,7 +811,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -840,12 +840,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -858,7 +858,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -867,7 +867,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -877,7 +877,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
@@ -900,7 +900,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
@@ -908,7 +908,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
@@ -916,7 +916,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
@@ -939,7 +939,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_movk_i32 s4, 0xf800
@@ -970,7 +970,7 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_movk_i32 s4, 0xf800
@@ -1003,12 +1003,12 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
- %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1037,7 +1037,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -1046,7 +1046,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
@@ -1072,7 +1072,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -1096,7 +1096,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -1120,7 +1120,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -1142,7 +1142,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1165,7 +1165,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -1193,7 +1193,7 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -1223,12 +1223,12 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret float %result
}
-define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1) %ptr, float %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1255,7 +1255,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc0 sc1
@@ -1264,7 +1264,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
@@ -1288,7 +1288,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
@@ -1311,7 +1311,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
@@ -1334,7 +1334,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
@@ -1355,7 +1355,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -1378,7 +1378,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -1405,7 +1405,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -1434,16 +1434,12 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; float with ftz/daz
-; --------------------------------------------------------------------
-
-define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1451,34 +1447,34 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1486,7 +1482,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1499,25 +1495,25 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
@@ -1529,36 +1525,37 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB8_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1567,7 +1564,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1579,14 +1576,14 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB8_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -1596,7 +1593,7 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
@@ -1608,12 +1605,13 @@ define float @global_agent_atomic_fadd_ret_f32__ftz(ptr addrspace(1) %ptr, float
; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
ret float %result
}
-define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1626,7 +1624,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -1635,7 +1633,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1645,7 +1643,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -1669,7 +1667,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
@@ -1677,7 +1675,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
@@ -1699,7 +1697,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
@@ -1722,7 +1720,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -1750,7 +1748,7 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -1780,12 +1778,12 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0
ret float %result
}
-define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1793,181 +1791,154 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz(ptr addrspace
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB10_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB10_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB10_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v4, v1
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: v_add_f32_e32 v5, v6, v2
-; GFX7-NEXT: v_mov_b32_e32 v0, v5
-; GFX7-NEXT: v_mov_b32_e32 v1, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB10_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v4, v1
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
-; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: v_add_f32_e32 v5, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v5
-; GFX6-NEXT: v_mov_b32_e32 v1, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB10_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
- %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
- ret float %result
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0
+ ret void
}
-define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -1975,151 +1946,171 @@ define void @global_agent_atomic_fadd_noret_f32__ftz(ptr addrspace(1) %ptr, floa
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB11_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB11_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB11_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB11_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB11_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -2127,38 +2118,39 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
@@ -2166,115 +2158,131 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspac
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB12_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB12_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB12_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -2282,234 +2290,210 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz(ptr addrspac
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB13_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB13_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB13_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB13_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB13_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
- %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0
+ ret float %result
}
-define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB14_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB14_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
@@ -2517,207 +2501,147 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz(ptr addrspac
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB14_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB14_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB14_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB14_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB14_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst
- ret float %result
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst
+ ret void
}
-define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspace(1) %ptr, float %val) #1 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB15_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB15_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
@@ -2740,51 +2664,23 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB15_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
@@ -2807,7 +2703,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -2834,7 +2730,7 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -2863,762 +2759,539 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz(ptr addrspa
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; double
-; --------------------------------------------------------------------
-
-define double @global_agent_atomic_fadd_ret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f64:
+define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB16_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB16_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB16_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB16_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB16_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v4
-; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB16_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f64:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB16_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret void
}
-define double @global_agent_atomic_fadd_ret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB17_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB17_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB17_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB17_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB17_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB17_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB17_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB17_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
- %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
- ret double %result
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.ignore.denormal.mode !0
+ ret void
}
-define double @global_agent_atomic_fadd_ret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB18_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB18_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB18_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v4
-; GFX10-NEXT: v_mov_b32_e32 v1, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB18_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v4
-; GFX908-NEXT: v_mov_b32_e32 v1, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v9, v1
-; GFX8-NEXT: v_mov_b32_e32 v8, v0
-; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB18_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v11, v1
-; GFX7-NEXT: v_mov_b32_e32 v10, v0
-; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
-; GFX7-NEXT: v_mov_b32_e32 v0, v8
-; GFX7-NEXT: v_mov_b32_e32 v1, v9
-; GFX7-NEXT: v_mov_b32_e32 v2, v10
-; GFX7-NEXT: v_mov_b32_e32 v3, v11
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v1
-; GFX6-NEXT: v_mov_b32_e32 v10, v0
-; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v0, v8
-; GFX6-NEXT: v_mov_b32_e32 v1, v9
-; GFX6-NEXT: v_mov_b32_e32 v2, v10
-; GFX6-NEXT: v_mov_b32_e32 v3, v11
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
- %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
- ret double %result
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f64:
+define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB19_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB19_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB19_1
@@ -3626,406 +3299,321 @@ define void @global_agent_atomic_fadd_noret_f64(ptr addrspace(1) %ptr, double %v
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB19_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB19_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB19_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
-; GFX7-NEXT: v_mov_b32_e32 v9, v5
-; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB19_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f64:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
-; GFX6-NEXT: v_mov_b32_e32 v9, v5
-; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB19_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define void @global_agent_atomic_fadd_noret_f64__offset12b_pos(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB20_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB20_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB20_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB20_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB20_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
-; GFX7-NEXT: v_mov_b32_e32 v9, v5
-; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB20_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
-; GFX6-NEXT: v_mov_b32_e32 v9, v5
-; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB20_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
- %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1) %ptr, double %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB21_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB21_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
-; GFX10-NEXT: v_mov_b32_e32 v7, v5
-; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB21_1
@@ -4033,1971 +3621,994 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB21_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v7, v5
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v6, v4
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB21_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB21_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
-; GFX7-NEXT: v_mov_b32_e32 v11, v7
-; GFX7-NEXT: v_mov_b32_e32 v10, v6
-; GFX7-NEXT: v_mov_b32_e32 v9, v5
-; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v8
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v7, v9
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB21_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB21_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v11, v7
-; GFX6-NEXT: v_mov_b32_e32 v10, v6
-; GFX6-NEXT: v_mov_b32_e32 v9, v5
-; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v8
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v7, v9
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB21_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
- %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
-; --------------------------------------------------------------------
-; half
-; --------------------------------------------------------------------
-
-define half @global_agent_atomic_fadd_ret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16:
+define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB22_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX940-NEXT: global_load_dword v4, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB22_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB22_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB22_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB22_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB22_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB22_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX7-NEXT: v_not_b32_e32 v7, v2
; GFX7-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
-; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB22_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f16:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
-; GFX6-NEXT: v_not_b32_e32 v7, v2
; GFX6-NEXT: .LBB22_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v7
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB22_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
- ret half %result
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define half @global_agent_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB23_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_load_dword v4, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB23_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB23_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB23_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB23_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB23_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB23_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB23_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB23_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB23_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB23_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
- ret half %result
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define half @global_agent_atomic_fadd_ret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; --------------------------------------------------------------------
+; float with ftz/daz
+; --------------------------------------------------------------------
+
+define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB24_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_load_dword v4, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB24_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB24_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB24_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB24_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB24_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB24_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB24_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB24_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB24_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
- %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
- ret half %result
- }
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
+}
-define void @global_agent_atomic_fadd_noret_f16(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16:
+define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v3, v0
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB25_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB25_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB25_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB25_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB25_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB25_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB25_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB25_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB25_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f16:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
-; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB25_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB26_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB26_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB26_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB26_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB26_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB26_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB26_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: v_mov_b32_e32 v4, v1
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_add_f32_e32 v5, v6, v2
+; GFX7-NEXT: v_mov_b32_e32 v0, v5
+; GFX7-NEXT: v_mov_b32_e32 v1, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB26_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: v_mov_b32_e32 v4, v1
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
+; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dword v0, v[3:4], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v3, vcc, 0xfffff800, v3
+; GFX6-NEXT: v_addc_u32_e32 v4, vcc, -1, v4, vcc
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB26_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v0
+; GFX6-NEXT: v_add_f32_e32 v5, v6, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v5
+; GFX6-NEXT: v_mov_b32_e32 v1, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v[3:4], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB26_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB27_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB27_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB27_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -6012,91 +4623,31 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB27_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: .LBB27_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB27_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6109,451 +4660,264 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB27_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB27_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB27_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
- %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB28_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB28_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB28_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB28_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB28_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT: .LBB28_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB28_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_e32 v0, v1, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB28_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
- ret half %result
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB29_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
-; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB29_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB29_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -6566,65 +4930,33 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX90A-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB29_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
-; GFX908-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB29_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_e32 v3, v4, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -6637,460 +4969,310 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos(ptr addrs
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB29_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB29_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 -512
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define half @global_system_atomic_fadd_ret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB30_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
-; GFX940-NEXT: v_mov_b32_e32 v1, v7
-; GFX940-NEXT: global_load_dword v4, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v4
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB30_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB30_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB30_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v4, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v4
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB30_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v7, v4
-; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
-; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
-; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB30_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB30_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB30_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
-; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB30_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB30_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst
- ret half %result
-}
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret float %result
+}
-define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1) %ptr, half %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2044
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: v_add_f32_e32 v3, v4, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
@@ -7103,66 +5285,26 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB31_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -7176,27 +5318,17 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -7209,61 +5341,39 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
-; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_cbranch_execnz .LBB31_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
@@ -7275,27 +5385,17 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -7308,1461 +5408,605 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB31_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
-; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB31_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB31_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; bfloat
-; --------------------------------------------------------------------
-
-define bfloat @global_agent_atomic_fadd_ret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16:
+define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB32_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB32_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB32_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB32_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB32_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB32_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v5, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
; GFX908-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB32_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f32_e32 v0, v1, v2
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB32_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
; GFX7-NEXT: v_mov_b32_e32 v3, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_and_b32_e32 v3, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
-; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
; GFX6-NEXT: v_mov_b32_e32 v3, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret float %result
}
-define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB33_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB33_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB33_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB33_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB33_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v5, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB33_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB33_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB33_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB33_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
+ %gep = getelementptr float, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
+ ret void
}
-define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
-; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB34_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB34_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB34_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB34_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB34_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v5, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
; GFX908-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB34_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB34_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX6-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB34_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
- %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
- ret bfloat %result
- }
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret float %result
+}
-define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16:
+define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX12-NEXT: global_load_b32 v4, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v6, v3
-; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB35_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v0
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v6, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v5, v4
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB35_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v6, v3
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB35_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v3, v0
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
; GFX10-NEXT: global_load_dword v4, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
@@ -8777,109 +6021,31 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v6, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
-; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
-; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
-; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
-; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v5, v4
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB35_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v3, v0
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v4, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX908-NEXT: v_not_b32_e32 v6, v3
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB35_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v3, v0
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
-; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -8892,661 +6058,284 @@ define void @global_agent_atomic_fadd_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX7-NEXT: v_mov_b32_e32 v8, v4
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB35_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v3, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
-; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
-; GFX6-NEXT: v_mov_b32_e32 v8, v4
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB35_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB36_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB36_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB36_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB36_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB36_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB36_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
; GFX908-NEXT: global_load_dword v3, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
; GFX908-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB36_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v5, v5
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX8-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB36_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX6-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_add_f32_e32 v4, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB36_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret float %result
}
-define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v5, v5
-; GFX12-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
-; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: s_cbranch_execnz .LBB37_1
-; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_movk_i32 s0, 0xf800
-; GFX940-NEXT: s_mov_b32 s1, -1
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB37_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: global_atomic_add_f32 v[0:1], v2, off
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
-; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: s_cbranch_execnz .LBB37_1
-; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB37_1
@@ -9554,1298 +6343,796 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB37_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
-; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: global_atomic_add_f32 v[0:1], v2, off
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
-; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX908-NEXT: s_cbranch_execnz .LBB37_1
-; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX8-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB37_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f32_e32 v3, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX6-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v4, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB37_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
- %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret void
}
-define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; --------------------------------------------------------------------
+; double
+; --------------------------------------------------------------------
+
+define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB38_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX940-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB38_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB38_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
-; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB38_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v5, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB38_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
; GFX908-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
-; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
-; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB38_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
-; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
-; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
-; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB38_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v4
+; GFX8-NEXT: v_mov_b32_e32 v1, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
+; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
+; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_mov_b32_e32 v7, v1
+; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
+; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v8
+; GFX6-NEXT: v_mov_b32_e32 v1, v9
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
+; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB38_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
- ret bfloat %result
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
}
-define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB39_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
-; GFX940-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB39_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:2040
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB39_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
-; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB39_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
-; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB39_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:2040
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
-; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
; GFX908-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
-; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB39_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7f8, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
-; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
-; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB39_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX7-NEXT: v_mov_b32_e32 v6, v3
-; GFX7-NEXT: v_mov_b32_e32 v5, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
+; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
+; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v5
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB39_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_mov_b32_e32 v7, v1
+; GFX6-NEXT: v_mov_b32_e32 v6, v0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64 offset:2040
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
+; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
-; GFX6-NEXT: v_mov_b32_e32 v6, v3
-; GFX6-NEXT: v_mov_b32_e32 v5, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v8
+; GFX6-NEXT: v_mov_b32_e32 v1, v9
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
+; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v5
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB39_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4
- ret void
+ %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+ %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
}
-define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v5, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v4, v4
; GFX12-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v6, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_cbranch_execnz .LBB40_1
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v5, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
-; GFX940-NEXT: v_not_b32_e32 v4, v4
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v7, v5
-; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB40_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b64 v[4:5], v[0:1], off offset:-2048
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v5, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v4, v4
-; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB40_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: global_load_dword v5, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v5
-; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB40_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: v_mov_b32_e32 v0, v4
+; GFX10-NEXT: v_mov_b32_e32 v1, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX90A-NEXT: global_load_dword v5, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX90A-NEXT: v_not_b32_e32 v4, v4
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v5
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB40_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX908-NEXT: global_load_dword v5, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: global_load_dwordx2 v[4:5], v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
; GFX908-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v5
-; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
-; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB40_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: v_mov_b32_e32 v0, v4
+; GFX908-NEXT: v_mov_b32_e32 v1, v5
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
-; GFX8-NEXT: flat_load_dword v5, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
-; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[4:5]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX8-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v5
-; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
-; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_mov_b32_e32 v9, v1
+; GFX8-NEXT: v_mov_b32_e32 v8, v0
+; GFX8-NEXT: v_add_f64 v[6:7], v[8:9], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB40_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v7, v4
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mov_b32_e32 v11, v1
+; GFX7-NEXT: v_mov_b32_e32 v10, v0
+; GFX7-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v0, v8
+; GFX7-NEXT: v_mov_b32_e32 v1, v9
+; GFX7-NEXT: v_mov_b32_e32 v2, v10
+; GFX7-NEXT: v_mov_b32_e32 v3, v11
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
-; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: v_mov_b32_e32 v7, v1
+; GFX6-NEXT: v_mov_b32_e32 v6, v0
+; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dwordx2 v[0:1], v[6:7], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v6, vcc, 0xfffff800, v6
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_addc_u32_e32 v7, vcc, -1, v7, vcc
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v7, v4
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
; GFX6-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_mov_b32_e32 v11, v1
+; GFX6-NEXT: v_mov_b32_e32 v10, v0
+; GFX6-NEXT: v_add_f64 v[8:9], v[10:11], v[4:5]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v0, v8
+; GFX6-NEXT: v_mov_b32_e32 v1, v9
+; GFX6-NEXT: v_mov_b32_e32 v2, v10
+; GFX6-NEXT: v_mov_b32_e32 v3, v11
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[6:7], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB40_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst
- ret bfloat %result
+ %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+ %result = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret double %result
}
-define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1) %ptr, bfloat %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX12-NEXT: s_mov_b32 s0, 0
-; GFX12-NEXT: global_load_b32 v3, v[0:1], off
-; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_not_b32_e32 v5, v5
; GFX12-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -10854,90 +7141,31 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
-; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
-; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX940-NEXT: v_mov_b32_e32 v1, v5
-; GFX940-NEXT: global_load_dword v3, v[0:1], off
-; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX940-NEXT: s_mov_b32 s0, 0xffff
-; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
-; GFX940-NEXT: v_not_b32_e32 v5, v5
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX940-NEXT: s_movk_i32 s2, 0x7fff
-; GFX940-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX940-NEXT: s_nop 0
-; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
-; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX940-NEXT: s_nop 1
-; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB41_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off
; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
-; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_not_b32_e32 v5, v5
-; GFX11-NEXT: .p2align 6
; GFX11-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
@@ -10946,38 +7174,23 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX10-NEXT: s_mov_b32 s4, 0
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
-; GFX10-NEXT: v_not_b32_e32 v5, v5
; GFX10-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
-; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB41_1
@@ -10985,1397 +7198,1680 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
-; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX90A-NEXT: s_mov_b32 s4, 0xffff
-; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX90A-NEXT: v_not_b32_e32 v5, v5
-; GFX90A-NEXT: s_mov_b64 s[4:5], 0
-; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
-; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start
-; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX90A-NEXT: s_cbranch_execnz .LBB41_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
-; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX908-NEXT: s_mov_b32 s4, 0xffff
-; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off
; GFX908-NEXT: s_mov_b64 s[4:5], 0
-; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
-; GFX908-NEXT: s_movk_i32 s6, 0x7fff
; GFX908-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
-; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB41_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
-; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX8-NEXT: s_mov_b32 s4, 0xffff
-; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
-; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
; GFX8-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
-; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
-; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB41_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX7-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX7-NEXT: v_mov_b32_e32 v8, v3
-; GFX7-NEXT: v_mov_b32_e32 v7, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v11, v7
+; GFX7-NEXT: v_mov_b32_e32 v10, v6
+; GFX7-NEXT: v_mov_b32_e32 v9, v5
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB41_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
; GFX6-NEXT: .LBB41_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
-; GFX6-NEXT: v_mov_b32_e32 v8, v3
-; GFX6-NEXT: v_mov_b32_e32 v7, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v11, v7
+; GFX6-NEXT: v_mov_b32_e32 v10, v6
+; GFX6-NEXT: v_mov_b32_e32 v9, v5
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB41_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
- %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; <2 x half>
-; --------------------------------------------------------------------
-
-define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16:
+define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB42_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:2040
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB42_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
-; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB42_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:2040
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:2040 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB42_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7f8, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB42_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
-; GFX7-NEXT: v_mov_b32_e32 v9, v7
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v11, v7
+; GFX7-NEXT: v_mov_b32_e32 v10, v6
+; GFX7-NEXT: v_mov_b32_e32 v9, v5
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64 offset:2040
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
-; GFX6-NEXT: v_mov_b32_e32 v9, v7
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_mov_b32_e32 v11, v7
+; GFX6-NEXT: v_mov_b32_e32 v10, v6
+; GFX6-NEXT: v_mov_b32_e32 v9, v5
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 offset:2040 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB42_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
- ret <2 x half> %result
+ %gep = getelementptr double, ptr addrspace(1) %ptr, i64 255
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB43_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_f64_e32 v[4:5], v[6:7], v[2:3]
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB43_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b64 v[6:7], v[0:1], off offset:-2048
; GFX11-NEXT: s_mov_b32 s0, 0
; GFX11-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB43_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
; GFX10-NEXT: s_mov_b32 s4, 0
; GFX10-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT: v_mov_b32_e32 v7, v5
+; GFX10-NEXT: v_mov_b32_e32 v6, v4
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB43_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dwordx2 v[6:7], v[0:1], off offset:-2048
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX908-NEXT: global_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7], off offset:-2048 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v7, v5
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v6, v4
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB43_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[0:1]
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v5
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v4
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB43_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
-; GFX7-NEXT: v_mov_b32_e32 v9, v7
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
+; GFX7-NEXT: v_mov_b32_e32 v11, v7
+; GFX7-NEXT: v_mov_b32_e32 v10, v6
+; GFX7-NEXT: v_mov_b32_e32 v9, v5
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v6, v8
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v7, v9
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dwordx2 v[6:7], v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
; GFX6-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_add_f64 v[4:5], v[6:7], v[2:3]
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
-; GFX6-NEXT: v_mov_b32_e32 v9, v7
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: v_mov_b32_e32 v11, v7
+; GFX6-NEXT: v_mov_b32_e32 v10, v6
+; GFX6-NEXT: v_mov_b32_e32 v9, v5
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: buffer_atomic_cmpswap_x2 v[8:11], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[6:7]
+; GFX6-NEXT: v_mov_b32_e32 v6, v8
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v7, v9
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB43_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
- ret <2 x half> %result
+ %gep = getelementptr double, ptr addrspace(1) %ptr, i64 -256
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, double %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; --------------------------------------------------------------------
+; half
+; --------------------------------------------------------------------
+
+define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB44_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: global_load_dword v4, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB44_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB44_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB44_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB44_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB44_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
+; GFX7-NEXT: v_not_b32_e32 v7, v2
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
-; GFX7-NEXT: v_mov_b32_e32 v9, v7
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v3, v4, v7
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
+; GFX6-NEXT: v_not_b32_e32 v7, v2
; GFX6-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX6-NEXT: v_or_b32_e32 v7, v0, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v6, v8, v0
-; GFX6-NEXT: v_mov_b32_e32 v9, v7
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v3, v4, v7
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB44_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
- %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
- ret <2 x half> %result
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
}
-define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16:
+define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB45_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: global_load_dword v4, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB45_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB45_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB45_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB45_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB45_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
-; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB45_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB45_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB45_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB45_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
- ret void
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
}
-define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB46_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: global_load_dword v4, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB46_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
; GFX11-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB46_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
; GFX10-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB46_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB46_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB46_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB46_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT: v_not_b32_e32 v8, v2
; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT: v_not_b32_e32 v8, v2
; GFX6-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB46_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
- ret void
-}
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
+ %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
+ }
-define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v3, v0
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB47_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB47_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: v_mov_b32_e32 v3, v0
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -12389,17 +8885,26 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -12412,24 +8917,57 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
-; GFX908: ; %bb.0:
-; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
@@ -12441,19 +8979,26 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
@@ -12466,382 +9011,533 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg(ptr addrspace(1
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB47_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
- %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB48_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB48_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_cbranch_execnz .LBB48_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v4, v3
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_cbranch_execnz .LBB48_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB48_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v4, v3
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_cbranch_execnz .LBB48_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v1, v0
-; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
-; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_cbranch_execnz .LBB48_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v2
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
-; GFX7-NEXT: v_mov_b32_e32 v9, v7
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
-; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v2
-; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v2
; GFX6-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
-; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
-; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
-; GFX6-NEXT: v_mov_b32_e32 v9, v7
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
-; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB48_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v2
-; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst
- ret <2 x half> %result
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB49_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB49_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
; GFX11-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -12855,17 +9551,27 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
; GFX10-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -12878,26 +9584,59 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
@@ -12909,137 +9648,10283 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos(ptr addrspace(
; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
; GFX8-NEXT: s_mov_b64 s[4:5], 0
; GFX8-NEXT: .LBB49_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
-; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB49_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v2
+; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB49_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v2
+; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB49_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB50_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
+; GFX940-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB50_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB50_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB50_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_e32 v0, v1, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB50_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB50_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB50_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %result = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret half %result
+}
+
+define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB51_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: s_mov_b32 s2, 0xffff0000
+; GFX940-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX940-NEXT: v_and_or_b32 v4, v5, s2, v3
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB51_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2046
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX10-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB51_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:2046
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s6, v3
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2046
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: s_mov_b32 s6, 0xffff0000
+; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX908-NEXT: v_and_or_b32 v3, v4, s6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB51_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_e32 v3, v4, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB51_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB51_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB51_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Returning form of a seq_cst atomicrmw fadd on a half with no syncscope
+; (i.e. the default, system-wide scope), tagged with
+; !amdgpu.no.fine.grained.memory. The address is %ptr plus 1023 half elements,
+; which is the 0x7fe byte offset added in the expected code.
+;
+; The atomicrmw carries no explicit alignment, and the expected code for every
+; checked target handles an address that may sit in either half of a dword:
+; it masks the address with -4 to get the containing dword, turns the low two
+; address bits into a bit shift (times 8), builds an inverted 0xffff lane mask,
+; and loops on a 32-bit cmpswap until the returned dword equals the one that
+; was loaded. The old half is then shifted back down into v0 as the result.
+;
+; On GFX6 and GFX7 the expected loop performs the addition as v_add_f32
+; between f16<->f32 conversions instead of using v_add_f16.
+;
+; NOTE(review): the definition of !0 is outside this hunk; confirm it is the
+; empty metadata node this test file uses for the annotation.
+define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB52_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[6:7], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v6
+; GFX940-NEXT: v_mov_b32_e32 v1, v7
+; GFX940-NEXT: global_load_dword v4, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v6
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v4
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB52_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX10-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB52_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v4, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v4
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v7, v4
+; GFX908-NEXT: v_lshrrev_b32_e32 v4, v3, v7
+; GFX908-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX908-NEXT: v_and_or_b32 v6, v7, v5, v4
+; GFX908-NEXT: global_atomic_cmpswap v4, v[0:1], v[6:7], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB52_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v4
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX8-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB52_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
+; GFX7-NEXT: v_not_b32_e32 v8, v2
+; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, v3, v8
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB52_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
+; GFX6-NEXT: v_not_b32_e32 v8, v2
+; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, v3, v8
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB52_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %result = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret half %result
+}
+
+define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX12-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB53_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX940-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB53_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX11-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX10-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB53_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX90A-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX908-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB53_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX8-NEXT: v_add_f16_e32 v3, v3, v2
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB53_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX7-NEXT: v_not_b32_e32 v6, v2
+; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB53_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
+; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
+; GFX6-NEXT: v_not_b32_e32 v6, v2
+; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, v3, v6
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB53_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; --------------------------------------------------------------------
+; bfloat
+; --------------------------------------------------------------------
+
+define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB54_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v4, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB54_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB54_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v5, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB54_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB54_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v5, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v3, v4, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB54_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v5, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v3, v4, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_or_b32_e32 v3, v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB54_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB55_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v4, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB55_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB55_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v5, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB55_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB55_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB55_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB55_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB56_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v4, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB56_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB56_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v5, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB56_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB56_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB56_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB56_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
+ %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v6, v3
+; GFX12-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX12-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB57_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v3, v0
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v6, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX940-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX940-NEXT: v_add3_u32 v7, v7, v4, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v5, v4
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB57_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v6, v3
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX11-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v3, v5, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v6, v3
+; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB57_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v3, v0
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v6, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v3, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT: v_bfe_u32 v7, v4, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v4
+; GFX90A-NEXT: v_add3_u32 v7, v7, v4, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX90A-NEXT: global_atomic_cmpswap v4, v[0:1], v[4:5], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v5, v4
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v3, v0
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX908-NEXT: v_not_b32_e32 v6, v3
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v3, v4, v6, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB57_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v3, v0
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v3, v5, s4
+; GFX8-NEXT: v_not_b32_e32 v6, v3
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v5, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v3
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB57_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mov_b32_e32 v3, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX7-NEXT: v_mov_b32_e32 v8, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v3
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v4, v7
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB57_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_mov_b32_e32 v3, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
+; GFX6-NEXT: v_mov_b32_e32 v8, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v3
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v4
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v4, v7
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB57_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v5, v5
+; GFX12-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB58_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB58_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v5, v5
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB58_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB58_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB58_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB58_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB58_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB58_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v5, v5
+; GFX12-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB59_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_movk_i32 s0, 0xf800
+; GFX940-NEXT: s_mov_b32 s1, -1
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB59_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v5, v5
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB59_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB59_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB59_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB59_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB59_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB59_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB59_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Value-returning bfloat atomicrmw fadd on a global (addrspace(1)) pointer at
+; agent scope, seq_cst, with an explicit "align 4" and the
+; !amdgpu.no.fine.grained.memory annotation.
+; The address is %ptr plus 1023 bfloat elements, which shows up as offset 2046
+; in the expected memory instructions below (materialized with an explicit
+; 0x7fe add in the flat-addressing variant).
+; For every checked target the expected code is a compare-and-swap loop on the
+; 32-bit word at that address: the loaded word is shifted left by 16 for the
+; f32 add, the sum is narrowed back to 16 bits, and it is merged with the
+; word's upper half (mask 0xffff0000) before the cmpswap.
+; NOTE(review): the check lines look tool-generated; presumably they should be
+; regenerated rather than edited by hand -- confirm against the file header.
+define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX12-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB60_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
+; GFX940-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v5, v3
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX940-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX940-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX940-NEXT: v_add3_u32 v4, v4, v3, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX940-NEXT: v_and_or_b32 v4, v5, s3, v3
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB60_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v0, v3
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX11-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB60_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX10-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX10-NEXT: v_add3_u32 v5, v5, v3, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX10-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB60_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX90A-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v5, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX90A-NEXT: v_add3_u32 v4, v4, v3, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e32 v3, v4, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX90A-NEXT: v_and_or_b32 v4, v5, s7, v3
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:2046 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB60_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX908-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX908-NEXT: v_bfe_u32 v5, v3, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v3
+; GFX908-NEXT: v_add3_u32 v5, v5, v3, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT: v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX908-NEXT: v_and_or_b32 v3, v4, s7, v3
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2046 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB60_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
+; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX8-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
+; GFX8-NEXT: v_or_b32_sdwa v5, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB60_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB60_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB60_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ ; Index 1023 in bfloat elements; the checks above expect byte offset 2046.
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ ; The result is used (returned), so this is the value-returning form of the
+ ; atomic. Metadata node !0 is referenced here but defined outside this function.
+ %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret bfloat %result
+}
+
+define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB61_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: s_mov_b32 s3, 0xffff0000
+; GFX940-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v5, v5, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX940-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: v_and_or_b32 v2, v3, s3, v2
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB61_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB61_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v5, v5, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc_lo
+; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB61_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX90A-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2046
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: s_mov_b32 s7, 0xffff0000
+; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_bfe_u32 v5, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v6, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v5, v5, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX908-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: v_and_or_b32 v2, v3, s7, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2046 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB61_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_bfe_u32 v6, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v2
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
+; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v6, v7, vcc
+; GFX8-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB61_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX7-NEXT: v_mov_b32_e32 v6, v3
+; GFX7-NEXT: v_mov_b32_e32 v5, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v5
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB61_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v3
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v5
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB61_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 { ; Returning bf16 atomic fadd with no syncscope; every check block below expects a 32-bit cmpswap retry loop (.LBB62_1) on the 4-byte-aligned dword that contains the bf16.
+; GFX12-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX12-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: global_load_b32 v5, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX12-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v4, v4
+; GFX12-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v6, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX12-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB62_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v5, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v4, v3, s0
+; GFX940-NEXT: v_not_b32_e32 v4, v4
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_mov_b32_e32 v7, v5
+; GFX940-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX940-NEXT: v_add3_u32 v6, v6, v5, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB62_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: global_load_b32 v5, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX11-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v4, v4
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX11-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[5:6], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB62_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v3, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX10-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: global_load_dword v5, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX10-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v4, v4
+; GFX10-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v5
+; GFX10-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v6
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB62_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX90A-NEXT: global_load_dword v5, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX90A-NEXT: v_not_b32_e32 v4, v4
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v5
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v5, v3, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_cmpswap v5, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB62_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX908-NEXT: global_load_dword v5, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX908-NEXT: v_not_b32_e32 v4, v4
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v5
+; GFX908-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v5, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v5, v6, v4, v5
+; GFX908-NEXT: global_atomic_cmpswap v5, v[0:1], v[5:6], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB62_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX8-NEXT: flat_load_dword v5, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v4, v3, s4
+; GFX8-NEXT: v_not_b32_e32 v4, v4
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX8-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v5
+; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v3, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v6, v4
+; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX8-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v6
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB62_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v7, v4
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX7-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v4, v3, v7
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_mov_b32_e32 v5, v3
+; GFX7-NEXT: v_mov_b32_e32 v4, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB62_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v7, v4
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v8, 0xffff0000, v2
+; GFX6-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v8
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, v3, v7
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v6, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v3
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB62_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023 ; element 1023 of bfloat, i.e. the 0x7fe byte offset added to the pointer in the checks above
+ %result = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0 ; seq_cst with no syncscope and no explicit align; carries only the no-fine-grained-memory metadata
+ ret bfloat %result ; value produced by the atomicrmw; the checks extract it from the cmpswap result dword with the final v_lshrrev_b32
+}
+
+define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX12-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX12-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX12-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX12-NEXT: s_mov_b32 s0, 0
+; GFX12-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX12-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_not_b32_e32 v5, v5
+; GFX12-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX12-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_cbranch_execnz .LBB63_1
+; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: s_mov_b64 s[0:1], 0x7fe
+; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX940-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX940-NEXT: v_mov_b32_e32 v1, v5
+; GFX940-NEXT: global_load_dword v3, v[0:1], off
+; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX940-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX940-NEXT: s_mov_b32 s0, 0xffff
+; GFX940-NEXT: v_lshlrev_b32_e64 v5, v4, s0
+; GFX940-NEXT: v_not_b32_e32 v5, v5
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX940-NEXT: s_movk_i32 s2, 0x7fff
+; GFX940-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT: s_nop 0
+; GFX940-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT: v_add3_u32 v7, v7, v2, s2
+; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX940-NEXT: s_nop 1
+; GFX940-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX940-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT: v_mov_b32_e32 v3, v2
+; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_cbranch_execnz .LBB63_1
+; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX11-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX11-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_not_b32_e32 v5, v5
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX11-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB63_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x7fe, v0
+; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX10-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX10-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX10-NEXT: v_not_b32_e32 v5, v5
+; GFX10-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc_lo
+; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX10-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB63_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
+; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX90A-NEXT: s_mov_b32 s4, 0xffff
+; GFX90A-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX90A-NEXT: v_not_b32_e32 v5, v5
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX90A-NEXT: s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX90A-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz .LBB63_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, 0x7fe, v0
+; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX908-NEXT: s_mov_b32 s4, 0xffff
+; GFX908-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX908-NEXT: v_not_b32_e32 v5, v5
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX908-NEXT: s_movk_i32 s6, 0x7fff
+; GFX908-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s6
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX908-NEXT: v_cndmask_b32_e32 v2, v7, v8, vcc
+; GFX908-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB63_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x7fe, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX8-NEXT: s_mov_b32 s4, 0xffff
+; GFX8-NEXT: v_lshlrev_b32_e64 v5, v4, s4
+; GFX8-NEXT: v_not_b32_e32 v5, v5
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX8-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshrrev_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-NEXT: v_bfe_u32 v8, v2, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v2
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
+; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
+; GFX8-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB63_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v5
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX7-NEXT: v_mov_b32_e32 v8, v3
+; GFX7-NEXT: v_mov_b32_e32 v7, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v3, v7
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB63_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x7fe, v0
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v0, -4, v4
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v5
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, v3, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX6-NEXT: v_or_b32_e32 v2, v7, v2
+; GFX6-NEXT: v_mov_b32_e32 v8, v3
+; GFX6-NEXT: v_mov_b32_e32 v7, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v3, v7
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB63_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; --------------------------------------------------------------------
+; <2 x half>
+; --------------------------------------------------------------------
+
+; Returning agent-scope seq_cst atomicrmw fadd of <2 x half> on a global
+; (addrspace(1)) pointer with no offset, tagged !amdgpu.no.fine.grained.memory.
+; In the expected output below, the GFX12, GFX940 and GFX90A checks select
+; global_atomic_pk_add_f16 directly; the GFX11, GFX10, GFX908, GFX8, GFX7 and
+; GFX6 checks instead contain a load + cmpswap retry loop (atomicrmw.start).
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB64_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB64_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB64_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB64_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB64_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: .LBB64_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB64_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+; Returning agent-scope seq_cst atomicrmw fadd of <2 x half> on a global
+; (addrspace(1)) pointer advanced by a GEP of 511 <2 x half> elements, tagged
+; !amdgpu.no.fine.grained.memory. The expected output shows that displacement
+; as offset:2044 on the memory instructions for every check prefix except
+; GFX8, whose checks add 0x7fc to the pointer with explicit VALU adds first.
+; The GFX12, GFX940 and GFX90A checks select global_atomic_pk_add_f16; the
+; other prefixes contain a load + cmpswap retry loop (atomicrmw.start).
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB65_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB65_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB65_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB65_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB65_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: .LBB65_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB65_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB66_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB66_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB66_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB66_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v2
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v0
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB66_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v2
+; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: .LBB66_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v0
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB66_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
+ %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB67_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB67_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB67_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB67_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB67_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB67_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB68_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB68_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB68_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB68_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB68_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB68_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB68_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Agent-scope seq_cst atomicrmw fadd of <2 x half> whose result is unused.
+; The address is %ptr advanced by a gep index of -512 (byte offset -2048 in
+; the checks below), and the atomicrmw carries !amdgpu.no.fine.grained.memory.
+; The GFX12, GFX940 and GFX90A checks expect one global_atomic_pk_add_f16;
+; every other prefix expects a load followed by a cmpswap retry loop
+; (%atomicrmw.start), with GFX7 and GFX6 doing the add in f32 per element.
+define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:-2048
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB69_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB69_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:-2048
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-2048 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB69_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB69_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB69_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB69_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB69_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ ; Index -512 over 4-byte <2 x half> elements gives the -2048 byte offset
+ ; that appears as offset:-2048 / 0xfffff800 in the checks above.
+ %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 -512
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; seq_cst atomicrmw fadd of <2 x half> that returns the atomicrmw result. The
+; atomicrmw has no syncscope operand; the test name calls this the system
+; variant and the GFX12 checks invalidate with scope:SCOPE_SYS. The address
+; is %ptr advanced by a gep index of 511 (byte offset 2044 in the checks
+; below), and the atomicrmw carries !amdgpu.no.fine.grained.memory.
+; The GFX12, GFX940 and GFX90A checks expect one returning
+; global_atomic_pk_add_f16; every other prefix expects a load followed by a
+; cmpswap retry loop (%atomicrmw.start).
+define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB70_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB70_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB70_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v1, v0
+; GFX8-NEXT: v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB70_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB70_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: .LBB70_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB70_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ ; Index 511 over 4-byte <2 x half> elements gives the 2044 byte offset that
+ ; appears as offset:2044 / 0x7fc in the checks above.
+ %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+; seq_cst atomicrmw fadd of <2 x half> whose result is unused. The atomicrmw
+; has no syncscope operand; the test name calls this the system variant and
+; the GFX12 checks invalidate with scope:SCOPE_SYS. The address is %ptr
+; advanced by a gep index of 511 (byte offset 2044 in the checks below), and
+; the atomicrmw carries !amdgpu.no.fine.grained.memory.
+; The GFX12, GFX940 and GFX90A checks expect one global_atomic_pk_add_f16;
+; every other prefix expects a load followed by a cmpswap retry loop
+; (%atomicrmw.start).
+define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off offset:2044
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB71_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB71_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB71_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB71_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB71_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB71_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB71_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ ; Index 511 over 4-byte <2 x half> elements gives the 2044 byte offset that
+ ; appears as offset:2044 / 0x7fc in the checks above.
+ %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB72_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB72_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB72_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB72_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB72_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB72_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret <2 x half> %result
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB73_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB73_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB73_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB73_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB73_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB73_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB73_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB74_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB74_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB74_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB74_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB74_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: .LBB74_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB74_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret <2 x half> %result
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB75_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB75_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB75_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB75_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB75_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB75_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB75_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret void
+}
+
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB76_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB76_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB76_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB76_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT: v_mov_b32_e32 v9, v7
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB76_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v2
+; GFX7-NEXT: v_mov_b32_e32 v1, v3
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v6
+; GFX6-NEXT: .LBB76_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v7
+; GFX6-NEXT: v_or_b32_e32 v7, v2, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v6
+; GFX6-NEXT: v_or_b32_e32 v6, v8, v2
+; GFX6-NEXT: v_mov_b32_e32 v9, v7
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: buffer_atomic_cmpswap v[8:9], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v8
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB76_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v2
+; GFX6-NEXT: v_mov_b32_e32 v1, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+ ret <2 x half> %result
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execnz .LBB77_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-NEXT: s_mov_b32 s4, 0
+; GFX10-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_cbranch_execnz .LBB77_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v4, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[4:5], 0
+; GFX908-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT: v_mov_b32_e32 v4, v3
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_cbranch_execnz .LBB77_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v4, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT: v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT: v_mov_b32_e32 v4, v3
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_cbranch_execnz .LBB77_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB77_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
+; GFX6-NEXT: .LBB77_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
+; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
+; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
+; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
+; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB77_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+ ret void
+}
+
+; --------------------------------------------------------------------
+; <2 x bfloat>
+; --------------------------------------------------------------------
+
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB78_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB78_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB78_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB78_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB78_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v3
+; GFX7-NEXT: v_mov_b32_e32 v6, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB78_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: .LBB78_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v3
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB78_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB79_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB79_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB79_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB79_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB79_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v3
+; GFX7-NEXT: v_mov_b32_e32 v6, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB79_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: .LBB79_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v3
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB79_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB80_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB80_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB80_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB80_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB80_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: s_mov_b32 s5, -1
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB80_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: s_mov_b32 s5, -1
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: .LBB80_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v1
+; GFX6-NEXT: v_mov_b32_e32 v6, v0
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
+; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB80_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
+ %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB81_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB81_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB81_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB81_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB81_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB81_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: .LBB81_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB81_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB82_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB82_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB82_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB82_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB82_1
+; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_cbranch_execnz .LBB82_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: .LBB82_1: ; %atomicrmw.start
+; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: buffer_wbinvl1
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_cbranch_execnz .LBB82_1
+; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX940: ; %bb.0:
+; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: s_waitcnt vmcnt(0)
+; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: s_mov_b32 s1, 0
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT: .p2align 6
+; GFX11-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_cbranch_execnz .LBB83_1
+; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX10: ; %bb.0:
+; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: s_mov_b32 s5, 0
+; GFX10-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_cbranch_execnz .LBB83_1
+; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: s_mov_b64 s[6:7], 0
+; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_cbranch_execnz .LBB83_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT: s_mov_b64 s[6:7], 0
+; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT: s_movk_i32 s8, 0x7fff
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT: s_mov_b32 s9, 0x7060302
+; GFX908-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_cbranch_execnz .LBB83_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: s_mov_b64 s[6:7], 0
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0)
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GFX8-NEXT: v_mov_b32_e32 v4, v3
-; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX8-NEXT: s_cbranch_execnz .LBB49_1
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
+; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT: s_cbranch_execnz .LBB83_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_movk_i32 s4, 0xf800
+; GFX7-NEXT: s_mov_b32 s5, -1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX7-NEXT: .LBB49_1: ; %atomicrmw.start
-; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX7-NEXT: v_mov_b32_e32 v8, v6
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB49_1
+; GFX7-NEXT: s_cbranch_execnz .LBB83_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_movk_i32 s4, 0xf800
+; GFX6-NEXT: s_mov_b32 s5, -1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v2
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v5
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
-; GFX6-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: .LBB83_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
-; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
-; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
-; GFX6-NEXT: v_mov_b32_e32 v8, v6
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB49_1
+; GFX6-NEXT: s_cbranch_execnz .LBB83_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret void
}
-; --------------------------------------------------------------------
-; <2 x bfloat>
-; --------------------------------------------------------------------
-
-define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -13047,30 +19932,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v3
@@ -13095,7 +19980,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -13103,21 +19988,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB50_1
+; GFX11-NEXT: s_cbranch_execnz .LBB84_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v3
@@ -13137,29 +20022,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB50_1
+; GFX10-NEXT: s_cbranch_execnz .LBB84_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
@@ -13178,28 +20063,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB84_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off
+; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v3
@@ -13218,67 +20105,68 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB50_1
+; GFX908-NEXT: s_cbranch_execnz .LBB84_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: flat_load_dword v3, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v0, v[3:4]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v3
-; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v0
+; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
-; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB50_1
+; GFX8-NEXT: s_cbranch_execnz .LBB84_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -13287,7 +20175,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -13301,7 +20189,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
@@ -13309,21 +20197,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB50_1
+; GFX7-NEXT: s_cbranch_execnz .LBB84_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -13332,7 +20220,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB84_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -13347,7 +20235,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
@@ -13355,19 +20243,20 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr,
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB50_1
+; GFX6-NEXT: s_cbranch_execnz .LBB84_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+ %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
+ %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
ret <2 x bfloat> %result
}
-define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -13375,232 +20264,229 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0
+; GFX940-NEXT: buffer_wbl2 sc0 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v6, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB51_1
+; GFX11-NEXT: s_cbranch_execnz .LBB85_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v6, v3
-; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB51_1
+; GFX10-NEXT: s_cbranch_execnz .LBB85_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v7, v3
-; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB51_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB85_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_mov_b32_e32 v6, v3
-; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB51_1
+; GFX908-NEXT: s_cbranch_execnz .LBB85_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
-; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
-; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB51_1
+; GFX8-NEXT: s_cbranch_execnz .LBB85_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -13609,43 +20495,41 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v3
-; GFX7-NEXT: v_mov_b32_e32 v6, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB51_1
+; GFX7-NEXT: s_cbranch_execnz .LBB85_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v0, v3
-; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -13654,50 +20538,48 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addr
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
-; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX6-NEXT: .LBB85_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v3
-; GFX6-NEXT: v_mov_b32_e32 v6, v2
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
+; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB51_1
+; GFX6-NEXT: s_cbranch_execnz .LBB85_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v0, v3
-; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
- ret <2 x bfloat> %result
+ %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
}
-define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -13705,30 +20587,30 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 sc0
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v3
@@ -13753,7 +20635,7 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -13761,21 +20643,21 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB52_1
+; GFX11-NEXT: s_cbranch_execnz .LBB86_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v3
@@ -13795,29 +20677,29 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB52_1
+; GFX10-NEXT: s_cbranch_execnz .LBB86_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
@@ -13836,28 +20718,28 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB86_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v3
@@ -13876,162 +20758,156 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg(ptr addr
; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-2048 glc
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB52_1
+; GFX908-NEXT: s_cbranch_execnz .LBB86_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB52_1
+; GFX8-NEXT: s_cbranch_execnz .LBB86_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v1
-; GFX7-NEXT: v_mov_b32_e32 v6, v0
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v3
+; GFX7-NEXT: v_mov_b32_e32 v6, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB52_1
+; GFX7-NEXT: s_cbranch_execnz .LBB86_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: buffer_load_dword v6, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0xfffff800, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: v_addc_u32_e32 v5, vcc, -1, v1, vcc
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
-; GFX6-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: .LBB86_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v1
-; GFX6-NEXT: v_mov_b32_e32 v6, v0
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[4:5], s[4:7], 0 addr64 glc
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v3
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB52_1
+; GFX6-NEXT: s_cbranch_execnz .LBB86_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
- %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret <2 x bfloat> %result
}
-define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -14044,7 +20920,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
@@ -14053,7 +20929,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: global_load_b32 v3, v[0:1], off
@@ -14062,7 +20938,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14094,20 +20970,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB53_1
+; GFX11-NEXT: s_cbranch_execnz .LBB87_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14134,12 +21010,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB53_1
+; GFX10-NEXT: s_cbranch_execnz .LBB87_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: global_load_dword v3, v[0:1], off
@@ -14148,7 +21024,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14173,12 +21049,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB87_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: global_load_dword v3, v[0:1], off
@@ -14187,7 +21063,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14212,19 +21088,19 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB53_1
+; GFX908-NEXT: s_cbranch_execnz .LBB87_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14252,12 +21128,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB53_1
+; GFX8-NEXT: s_cbranch_execnz .LBB87_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
@@ -14273,7 +21149,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -14295,12 +21171,12 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB53_1
+; GFX7-NEXT: s_cbranch_execnz .LBB87_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
@@ -14316,7 +21192,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB87_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -14339,17 +21215,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB53_1
+; GFX6-NEXT: s_cbranch_execnz .LBB87_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
ret void
}
-define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -14357,320 +21233,327 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
-; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX11-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX11-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX11-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX11-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX11-NEXT: v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT: v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB54_1
+; GFX11-NEXT: s_cbranch_execnz .LBB88_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX10-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX10-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX10-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX10-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX10-NEXT: v_bfe_u32 v8, v6, 16, 1
-; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v2
-; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6
-; GFX10-NEXT: v_add3_u32 v7, v7, v2, 0x7fff
-; GFX10-NEXT: v_add3_u32 v8, v8, v6, 0x7fff
-; GFX10-NEXT: v_cmp_u_f32_e64 s4, v2, v2
-; GFX10-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
-; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT: v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT: v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT: v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT: v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
-; GFX10-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB54_1
+; GFX10-NEXT: s_cbranch_execnz .LBB88_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX90A-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX90A-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX90A-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX90A-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX90A-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX90A-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX90A-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT: v_mov_b32_e32 v7, v3
+; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT: v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT: v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
-; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB54_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB88_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
-; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
-; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX908-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX908-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX908-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX908-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX908-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX908-NEXT: v_add3_u32 v7, v7, v2, s8
-; GFX908-NEXT: v_add3_u32 v9, v9, v6, s8
-; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT: v_mov_b32_e32 v6, v3
+; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT: v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT: v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
-; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB54_1
+; GFX908-NEXT: s_cbranch_execnz .LBB88_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
-; GFX8-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-NEXT: v_add_f32_e32 v6, v6, v5
-; GFX8-NEXT: v_bfe_u32 v7, v2, 16, 1
-; GFX8-NEXT: v_bfe_u32 v9, v6, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v2
-; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
-; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v6
-; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v6, v6
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v2
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v6
-; GFX8-NEXT: v_alignbit_b32 v2, v6, v2, 16
-; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB54_1
+; GFX8-NEXT: s_cbranch_execnz .LBB88_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX7-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX7-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX7-NEXT: v_mov_b32_e32 v7, v5
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT: v_mov_b32_e32 v7, v3
+; GFX7-NEXT: v_mov_b32_e32 v6, v2
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB54_1
+; GFX7-NEXT: s_cbranch_execnz .LBB88_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
+; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
+; GFX6-NEXT: .LBB88_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v4
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v5
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v7, v7, v3
-; GFX6-NEXT: v_add_f32_e32 v6, v6, v2
-; GFX6-NEXT: v_alignbit_b32 v5, v4, v5, 16
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v7
-; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
-; GFX6-NEXT: v_mov_b32_e32 v7, v5
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX6-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX6-NEXT: v_alignbit_b32 v3, v2, v3, 16
+; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v7
+; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
+; GFX6-NEXT: v_mov_b32_e32 v7, v3
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB54_1
+; GFX6-NEXT: s_cbranch_execnz .LBB88_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
+; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
- ret void
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret <2 x bfloat> %result
}
-define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -14678,30 +21561,30 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
; GFX12-NEXT: s_wait_storecnt 0x0
; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_wbl2 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:-2048
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14724,7 +21607,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -14733,20 +21616,20 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB55_1
+; GFX11-NEXT: s_cbranch_execnz .LBB89_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14765,7 +21648,7 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -14773,21 +21656,21 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB55_1
+; GFX10-NEXT: s_cbranch_execnz .LBB89_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14805,28 +21688,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB89_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:-2048
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14844,28 +21727,26 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-2048 glc
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB55_1
+; GFX908-NEXT: s_cbranch_execnz .LBB89_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0xfffff800, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -14893,32 +21774,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB55_1
+; GFX8-NEXT: s_cbranch_execnz .LBB89_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0xf800
-; GFX7-NEXT: s_mov_b32 s5, -1
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s6, 0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -14940,32 +21817,28 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB55_1
+; GFX7-NEXT: s_cbranch_execnz .LBB89_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: s_movk_i32 s4, 0xf800
-; GFX6-NEXT: s_mov_b32 s5, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffff800, v0
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB89_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -14988,18 +21861,17 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg(ptr addrspace(
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB55_1
+; GFX6-NEXT: s_cbranch_execnz .LBB89_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 -512
- %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
ret void
}
-define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -15007,30 +21879,30 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 sc0 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v6, v3
@@ -15055,7 +21927,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX11-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0
; GFX11-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -15063,21 +21935,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB56_1
+; GFX11-NEXT: s_cbranch_execnz .LBB90_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v3
@@ -15097,29 +21969,29 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX10-NEXT: v_cndmask_b32_e64 v3, v7, v9, s4
; GFX10-NEXT: v_perm_b32 v5, v5, v3, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB56_1
+; GFX10-NEXT: s_cbranch_execnz .LBB90_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: v_mov_b32_e32 v0, v3
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v7, v3
@@ -15138,30 +22010,28 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX90A-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB90_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v0, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v6, v3
@@ -15180,68 +22050,67 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX908-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:2044 glc
+; GFX908-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB56_1
+; GFX908-NEXT: s_cbranch_execnz .LBB90_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v3, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_dword v0, v[3:4]
+; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_mov_b32_e32 v6, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT: v_mov_b32_e32 v6, v3
+; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX8-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-NEXT: v_add_f32_e32 v5, v5, v2
-; GFX8-NEXT: v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT: v_bfe_u32 v7, v3, 16, 1
; GFX8-NEXT: v_bfe_u32 v9, v5, 16, 1
-; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT: v_add_u32_e32 v7, vcc, v7, v3
; GFX8-NEXT: v_add_u32_e32 v9, vcc, v9, v5
; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x7fff, v7
; GFX8-NEXT: v_add_u32_e32 v9, vcc, 0x7fff, v9
; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v5
; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
-; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v0
-; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT: v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v3, v3
; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v8, s[4:5]
; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX8-NEXT: v_alignbit_b32 v5, v5, v0, 16
-; GFX8-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT: v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT: flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: buffer_wbinvl1
-; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v3, v6
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB56_1
+; GFX8-NEXT: s_cbranch_execnz .LBB90_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v0, v3
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -15250,7 +22119,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -15264,7 +22133,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX7-NEXT: v_alignbit_b32 v2, v2, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v7, v3
; GFX7-NEXT: v_mov_b32_e32 v6, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
@@ -15272,21 +22141,21 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB56_1
+; GFX7-NEXT: s_cbranch_execnz .LBB90_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: v_mov_b32_e32 v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -15295,7 +22164,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
-; GFX6-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB90_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
@@ -15310,7 +22179,7 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX6-NEXT: v_alignbit_b32 v2, v2, v6, 16
; GFX6-NEXT: v_mov_b32_e32 v7, v3
; GFX6-NEXT: v_mov_b32_e32 v6, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v3
@@ -15318,20 +22187,19 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos(ptr add
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB56_1
+; GFX6-NEXT: s_cbranch_execnz .LBB90_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: v_mov_b32_e32 v1, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
- %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
ret <2 x bfloat> %result
}
-define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
-; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
@@ -15339,30 +22207,30 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044
+; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
; GFX12-NEXT: s_setpc_b64 s[30:31]
;
-; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 sc1
+; GFX940-NEXT: buffer_wbl2 sc1
+; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v3, v[0:1], off offset:2044
+; GFX11-NEXT: global_load_b32 v3, v[0:1], off
; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-NEXT: s_mov_b32 s1, 0
; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-NEXT: .p2align 6
-; GFX11-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX11-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -15385,7 +22253,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -15394,20 +22262,20 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT: s_cbranch_execnz .LBB57_1
+; GFX11-NEXT: s_cbranch_execnz .LBB91_1
; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX10-NEXT: global_load_dword v3, v[0:1], off
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX10-NEXT: s_mov_b32 s5, 0
-; GFX10-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX10-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -15426,7 +22294,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX10-NEXT: v_cndmask_b32_e64 v2, v7, v9, s4
; GFX10-NEXT: v_perm_b32 v2, v6, v2, 0x7060302
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -15434,21 +22302,21 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX10-NEXT: v_mov_b32_e32 v3, v2
; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT: s_cbranch_execnz .LBB57_1
+; GFX10-NEXT: s_cbranch_execnz .LBB91_1
; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
-; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX90A-NEXT: global_load_dword v3, v[0:1], off
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX90A-NEXT: s_movk_i32 s8, 0x7fff
; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX90A-NEXT: s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -15466,30 +22334,28 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX90A-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX90A-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX90A-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX90A-NEXT: v_mov_b32_e32 v3, v2
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT: s_cbranch_execnz .LBB57_1
+; GFX90A-NEXT: s_cbranch_execnz .LBB91_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
-; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT: global_load_dword v3, v[0:1], off offset:2044
+; GFX908-NEXT: global_load_dword v3, v[0:1], off
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX908-NEXT: s_movk_i32 s8, 0x7fff
; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX908-NEXT: s_mov_b32 s9, 0x7060302
-; GFX908-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX908-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -15507,28 +22373,26 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX908-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[4:5]
; GFX908-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc
; GFX908-NEXT: v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:2044 glc
+; GFX908-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: buffer_wbinvl1
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX908-NEXT: v_mov_b32_e32 v3, v2
; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT: s_cbranch_execnz .LBB57_1
+; GFX908-NEXT: s_cbranch_execnz .LBB91_1
; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX908-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX8-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 0x7fc, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX8-NEXT: flat_load_dword v3, v[0:1]
; GFX8-NEXT: s_mov_b64 s[6:7], 0
; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
-; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -15556,19 +22420,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
-; GFX8-NEXT: s_cbranch_execnz .LBB57_1
+; GFX8-NEXT: s_cbranch_execnz .LBB91_1
; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
-; GFX7-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX7-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX7-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
@@ -15577,7 +22441,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -15591,7 +22455,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX7-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: v_mov_b32_e32 v6, v4
-; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -15599,19 +22463,19 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX7-NEXT: s_cbranch_execnz .LBB57_1
+; GFX7-NEXT: s_cbranch_execnz .LBB91_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos:
+; GFX6-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64 offset:2044
+; GFX6-NEXT: buffer_load_dword v5, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
@@ -15620,7 +22484,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX6-NEXT: .LBB91_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
@@ -15635,7 +22499,7 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX6-NEXT: v_alignbit_b32 v4, v4, v6, 16
; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: v_mov_b32_e32 v6, v4
-; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 offset:2044 glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v[0:1], s[4:7], 0 addr64 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
@@ -15643,13 +22507,12 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos(ptr addrspace
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_execnz .LBB57_1
+; GFX6-NEXT: s_cbranch_execnz .LBB91_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i64 511
- %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+ %unused = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
ret void
}
@@ -15665,7 +22528,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX12-NEXT: s_cbranch_execz .LBB58_2
+; GFX12-NEXT: s_cbranch_execz .LBB92_2
; GFX12-NEXT: ; %bb.1:
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT: s_bcnt1_i32_b32 s2, s2
@@ -15675,7 +22538,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
-; GFX12-NEXT: .LBB58_2:
+; GFX12-NEXT: .LBB92_2:
; GFX12-NEXT: s_nop 0
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-NEXT: s_endpgm
@@ -15687,7 +22550,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX940-NEXT: s_cbranch_execz .LBB58_2
+; GFX940-NEXT: s_cbranch_execz .LBB92_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -15697,7 +22560,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
-; GFX940-NEXT: .LBB58_2:
+; GFX940-NEXT: .LBB92_2:
; GFX940-NEXT: s_endpgm
;
; GFX11-LABEL: infer_as_before_atomic:
@@ -15707,7 +22570,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
-; GFX11-NEXT: s_cbranch_execz .LBB58_2
+; GFX11-NEXT: s_cbranch_execz .LBB92_2
; GFX11-NEXT: ; %bb.1:
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2
@@ -15717,7 +22580,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
-; GFX11-NEXT: .LBB58_2:
+; GFX11-NEXT: .LBB92_2:
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -15729,7 +22592,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
-; GFX10-NEXT: s_cbranch_execz .LBB58_3
+; GFX10-NEXT: s_cbranch_execz .LBB92_3
; GFX10-NEXT: ; %bb.1:
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
@@ -15741,7 +22604,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v1, s4
-; GFX10-NEXT: .LBB58_2: ; %atomicrmw.start
+; GFX10-NEXT: .LBB92_2: ; %atomicrmw.start
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
@@ -15750,8 +22613,8 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX10-NEXT: v_mov_b32_e32 v1, v0
; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
-; GFX10-NEXT: s_cbranch_execnz .LBB58_2
-; GFX10-NEXT: .LBB58_3:
+; GFX10-NEXT: s_cbranch_execnz .LBB92_2
+; GFX10-NEXT: .LBB92_3:
; GFX10-NEXT: s_endpgm
;
; GFX90A-LABEL: infer_as_before_atomic:
@@ -15761,7 +22624,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX90A-NEXT: s_cbranch_execz .LBB58_2
+; GFX90A-NEXT: s_cbranch_execz .LBB92_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -15771,7 +22634,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
-; GFX90A-NEXT: .LBB58_2:
+; GFX90A-NEXT: .LBB92_2:
; GFX90A-NEXT: s_endpgm
;
; GFX908-LABEL: infer_as_before_atomic:
@@ -15781,7 +22644,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX908-NEXT: s_cbranch_execz .LBB58_2
+; GFX908-NEXT: s_cbranch_execz .LBB92_2
; GFX908-NEXT: ; %bb.1:
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -15791,7 +22654,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
-; GFX908-NEXT: .LBB58_2:
+; GFX908-NEXT: .LBB92_2:
; GFX908-NEXT: s_endpgm
;
; GFX8-LABEL: infer_as_before_atomic:
@@ -15801,7 +22664,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX8-NEXT: s_cbranch_execz .LBB58_3
+; GFX8-NEXT: s_cbranch_execz .LBB92_3
; GFX8-NEXT: ; %bb.1:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -15815,7 +22678,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v3, s6
-; GFX8-NEXT: .LBB58_2: ; %atomicrmw.start
+; GFX8-NEXT: .LBB92_2: ; %atomicrmw.start
; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX8-NEXT: v_add_f32_e32 v2, v3, v4
; GFX8-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
@@ -15824,8 +22687,8 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX8-NEXT: v_mov_b32_e32 v3, v2
; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX8-NEXT: s_cbranch_execnz .LBB58_2
-; GFX8-NEXT: .LBB58_3:
+; GFX8-NEXT: s_cbranch_execnz .LBB92_2
+; GFX8-NEXT: .LBB92_3:
; GFX8-NEXT: s_endpgm
;
; GFX7-LABEL: infer_as_before_atomic:
@@ -15835,7 +22698,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX7-NEXT: s_cbranch_execz .LBB58_3
+; GFX7-NEXT: s_cbranch_execz .LBB92_3
; GFX7-NEXT: ; %bb.1:
; GFX7-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX7-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -15849,7 +22712,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX7-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v1, s6
-; GFX7-NEXT: .LBB58_2: ; %atomicrmw.start
+; GFX7-NEXT: .LBB92_2: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_add_f32_e32 v0, v1, v2
; GFX7-NEXT: v_mov_b32_e32 v4, v1
@@ -15860,8 +22723,8 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: v_mov_b32_e32 v1, v3
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX7-NEXT: s_cbranch_execnz .LBB58_2
-; GFX7-NEXT: .LBB58_3:
+; GFX7-NEXT: s_cbranch_execnz .LBB92_2
+; GFX7-NEXT: .LBB92_3:
; GFX7-NEXT: s_endpgm
;
; GFX6-LABEL: infer_as_before_atomic:
@@ -15871,7 +22734,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], vcc
-; GFX6-NEXT: s_cbranch_execz .LBB58_3
+; GFX6-NEXT: s_cbranch_execz .LBB92_3
; GFX6-NEXT: ; %bb.1:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; GFX6-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
@@ -15885,7 +22748,7 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX6-NEXT: s_load_dword s6, s[0:1], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s6
-; GFX6-NEXT: .LBB58_2: ; %atomicrmw.start
+; GFX6-NEXT: .LBB92_2: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_add_f32_e32 v0, v1, v2
; GFX6-NEXT: s_waitcnt expcnt(0)
@@ -15897,13 +22760,15 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #1 {
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: v_mov_b32_e32 v1, v3
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GFX6-NEXT: s_cbranch_execnz .LBB58_2
-; GFX6-NEXT: .LBB58_3:
+; GFX6-NEXT: s_cbranch_execnz .LBB92_2
+; GFX6-NEXT: .LBB92_3:
; GFX6-NEXT: s_endpgm
%load = load ptr, ptr addrspace(4) %arg
- %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
+ %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0
ret void
}
attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
+
+!0 = !{}
More information about the llvm-commits mailing list