[llvm] [AMDGPU] gfx1250 runlines for global-atomicrmw-fadd.ll. NFC (PR #159817)
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 19 10:24:09 PDT 2025
https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/159817
None
From 129604889abcd7f1bcd2694c395bc538d31e1541 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Fri, 19 Sep 2025 10:23:10 -0700
Subject: [PATCH] [AMDGPU] gfx1250 runlines for global-atomicrmw-fadd.ll. NFC
---
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 2582 +++++++++++++++++
1 file changed, 2582 insertions(+)
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 2cad8eeea33cf..b67a1c513c49f 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -1,4 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
@@ -16,6 +18,18 @@
; --------------------------------------------------------------------
define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -201,6 +215,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_fine_grained_memory(pt
}
define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -388,6 +414,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_gr
}
define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -585,6 +623,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_gr
}
define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -764,6 +814,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_fine_grained_memory(p
}
define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -946,6 +1008,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_g
}
define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1136,6 +1210,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__amdgpu_no_fine_g
}
define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1326,6 +1412,18 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__amdgpu_no_fine_g
}
define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1511,6 +1609,18 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__amdgpu_no_fine_
}
define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1714,6 +1824,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote(ptr addrspace(1) %pt
}
define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1917,6 +2039,18 @@ define float @global_agent_atomic_fadd_ret_f32_maybe_remote__amdgpu_ignore_denor
}
define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2113,6 +2247,18 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote__amdgpu_ignore_deno
}
define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2300,6 +2446,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory(p
}
define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2473,6 +2631,18 @@ define float @global_agent_atomic_fadd_ret_f32___amdgpu_no_fine_grained_memory__
}
define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2676,6 +2846,18 @@ define float @global_agent_atomic_fadd_ret_f32_amdgpu_ignore_denormal_mode(ptr a
}
define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -2872,6 +3054,18 @@ define void @global_agent_atomic_fadd_noret_f32_maybe_remote(ptr addrspace(1) %p
}
define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3054,6 +3248,18 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory(
}
define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3210,6 +3416,18 @@ define void @global_agent_atomic_fadd_noret_f32___amdgpu_no_fine_grained_memory_
}
define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3406,6 +3624,18 @@ define void @global_agent_atomic_fadd_noret_f32_amdgpu_ignore_denormal_mode(ptr
}
define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3607,6 +3837,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory(ptr addr
}
define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -3800,6 +4042,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory(ptr add
}
define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4001,6 +4255,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_
}
define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4194,6 +4460,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
}
define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4379,6 +4657,18 @@ define float @global_agent_atomic_fadd_ret_f32__amdgpu_no_remote_memory__amdgpu_
}
define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4562,6 +4852,18 @@ define void @global_agent_atomic_fadd_noret_f32__amdgpu_no_remote_memory__amdgpu
; --------------------------------------------------------------------
define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4733,6 +5035,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo
}
define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -4906,6 +5220,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fi
}
define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5089,6 +5415,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fi
}
define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5242,6 +5580,18 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
}
define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5398,6 +5748,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_f
}
define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5562,6 +5924,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_neg__ftz__amdgpu_no_f
}
define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5738,6 +6112,18 @@ define float @global_system_atomic_fadd_ret_f32__offset12b_pos__ftz__amdgpu_no_f
}
define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -5897,6 +6283,18 @@ define void @global_system_atomic_fadd_noret_f32__offset12b_pos__ftz__amdgpu_no_
}
define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6070,6 +6468,18 @@ define float @global_agent_atomic_fadd_ret_f32__offset12b_pos__ieee__amdgpu_no_f
}
define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, float %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_fine_grained_memory__amdgpu_ignore_denormal_mode:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6226,6 +6636,18 @@ define void @global_agent_atomic_fadd_noret_f32__offset12b_pos__ieee__amdgpu_no_
}
define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6427,6 +6849,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_remote_memory(ptr
}
define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6620,6 +7054,18 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_remote_memory(pt
}
define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6791,6 +7237,18 @@ define float @global_agent_atomic_fadd_ret_f32__ftz__amdgpu_no_fine_grained_memo
}
define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, float %val) #1 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -6948,6 +7406,18 @@ define void @global_agent_atomic_fadd_noret_f32__ftz__amdgpu_no_fine_grained_mem
; --------------------------------------------------------------------
define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7170,6 +7640,18 @@ define double @global_agent_atomic_fadd_ret_f64__amdgpu_no_fine_grained_memory(p
}
define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:2040 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7393,6 +7875,18 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_pos__amdgpu_no_fine_g
}
define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7624,6 +8118,18 @@ define double @global_agent_atomic_fadd_ret_f64__offset12b_neg__amdgpu_no_fine_g
}
define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -7828,6 +8334,18 @@ define void @global_agent_atomic_fadd_noret_f64__amdgpu_no_fine_grained_memory(p
}
define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:2040 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8035,6 +8553,18 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_pos__amdgpu_no_fine_g
}
define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, double %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_add_f64 v[0:1], v[2:3], off offset:-2048 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8254,6 +8784,92 @@ define void @global_agent_atomic_fadd_noret_f64__offset12b_neg__amdgpu_no_fine_g
; --------------------------------------------------------------------
define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB44_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -8676,6 +9292,92 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
}
define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB45_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB45_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9112,6 +9814,93 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
}
define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB46_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB46_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9549,6 +10338,90 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
}
define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-FAKE16-NEXT: .LBB47_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB47_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -9957,6 +10830,90 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
}
define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-FAKE16-NEXT: .LBB48_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB48_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10378,6 +11335,92 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
}
define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-FAKE16-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB49_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -10800,6 +11843,69 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
}
define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v3.l, v5.l, v2.l
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: .LBB50_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB50_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11123,6 +12229,68 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
}
define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v3.l, v5.l, v2.l
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: .LBB51_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v3, v5, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB51_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11434,6 +12602,91 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
}
define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v5.h, 0
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v5.l, v5.l, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB52_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -11874,6 +13127,90 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
}
define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, half %val) #0 {
+; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v4.h, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_f16_e32 v4.l, v4.l, v2.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-FAKE16-NEXT: .LBB53_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f16_e32 v4, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB53_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12303,6 +13640,101 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; --------------------------------------------------------------------
define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB54_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB54_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -12805,6 +14237,101 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
}
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_bitop2_b32 v3, 3, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB55_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB55_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13324,6 +14851,104 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB56_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB56_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -13844,6 +15469,99 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
}
define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v3, v0
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_lshlrev_b32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v0, -4, v3
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-FAKE16-NEXT: .LBB57_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v4, v4, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, v3, v4
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB57_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14330,6 +16048,99 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
}
define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v6, 16, v2 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v4, 3, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX1250-FAKE16-NEXT: .LBB58_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB58_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -14832,6 +16643,103 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
}
define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: s_mov_b64 s[0:1], lit64(0xfffffffffffff800)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], s[0:1], v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX1250-FAKE16-NEXT: .LBB59_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB59_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -15335,6 +17243,81 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
}
define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v3
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: .LBB60_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v3, v3, s0
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB60_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v0, v3
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -15747,6 +17730,79 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
}
define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off offset:2046
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v2.l
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v3
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.h, v3.l
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v3.l, v4.l
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, 0xffff0000, v5, v3
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[4:5], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v3
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off offset:2046
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: .LBB61_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, 0xffff0000, v3, v2
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:2046 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB61_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -16145,6 +18201,102 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
}
define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-TRUE16-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v5, v5, v6
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.h, v6.l
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v5, v3, v6
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB62_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v2 :: v_dual_bitop2_b32 v3, 3, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-FAKE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v4, v4
+; GFX1250-FAKE16-NEXT: .LBB62_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v7, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v5, v3, v7
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, 16, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v5, v5, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff, v5
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v6, v7, v4, v5
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v5, v[0:1], v[6:7], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v7
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB62_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v0, v3, v5
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -16668,6 +18820,100 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
}
define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, bfloat %val) #0 {
+;
+; GFX1250-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-TRUE16: ; %bb.0:
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-TRUE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-TRUE16-NEXT: v_and_b32_e32 v3, 3, v4
+; GFX1250-TRUE16-NEXT: global_load_b32 v5, v[0:1], off
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 3, v3
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e64 v4, v3, 0xffff
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_not_b32_e32 v6, v4
+; GFX1250-TRUE16-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX1250-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: v_lshrrev_b32_e32 v4, v3, v5
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, 0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v2.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX1250-TRUE16-NEXT: v_add_f32_e32 v4, v4, v7
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.h, v7.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_cvt_pk_bf16_f32 v4, v4, s0
+; GFX1250-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT: v_lshlrev_b32_e32 v4, v3, v7
+; GFX1250-TRUE16-NEXT: v_and_or_b32 v4, v5, v6, v4
+; GFX1250-TRUE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: global_atomic_cmpswap_b32 v4, v[0:1], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
+; GFX1250-TRUE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-TRUE16-NEXT: v_mov_b32_e32 v5, v4
+; GFX1250-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-TRUE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_cbranch_execnz .LBB63_1
+; GFX1250-TRUE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-TRUE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-FAKE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250-FAKE16: ; %bb.0:
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX1250-FAKE16-NEXT: v_add_nc_u64_e32 v[4:5], 0x7fe, v[0:1]
+; GFX1250-FAKE16-NEXT: s_mov_b32 s0, 0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT: v_dual_lshlrev_b32 v6, 16, v2 :: v_dual_bitop2_b32 v0, -4, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_bitop2_b32 v4, 3, v4 bitop3:0x40
+; GFX1250-FAKE16-NEXT: global_load_b32 v3, v[0:1], off
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 3, v4
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e64 v5, v4, 0xffff
+; GFX1250-FAKE16-NEXT: v_not_b32_e32 v5, v5
+; GFX1250-FAKE16-NEXT: .LBB63_1: ; %atomicrmw.start
+; GFX1250-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: v_lshrrev_b32_e32 v2, v4, v3
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX1250-FAKE16-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_cvt_pk_bf16_f32 v2, v2, s0
+; GFX1250-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT: v_lshlrev_b32_e32 v2, v4, v2
+; GFX1250-FAKE16-NEXT: v_and_or_b32 v2, v3, v5, v2
+; GFX1250-FAKE16-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_storecnt 0x0
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1250-FAKE16-NEXT: s_wait_xcnt 0x0
+; GFX1250-FAKE16-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1250-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-FAKE16-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_cbranch_execnz .LBB63_1
+; GFX1250-FAKE16-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX1250-FAKE16-NEXT: s_wait_loadcnt 0x0
+; GFX1250-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-TRUE16-LABEL: global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -17178,6 +19424,18 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; --------------------------------------------------------------------
define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -17409,6 +19667,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
}
define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -17642,6 +19912,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_
}
define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -17879,6 +20161,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_
}
define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -18088,6 +20382,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
}
define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -18300,6 +20606,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine
}
define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -18520,6 +20838,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine
}
define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -18756,6 +21086,18 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16__offset12b_pos__amdgpu_no
}
define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -18971,6 +21313,18 @@ define void @global_system_atomic_fadd_noret_v2f16__offset12b_pos__amdgpu_no_fin
}
define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -19216,6 +21570,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_remote_memory(p
}
define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -19451,6 +21817,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_remote_memory(ptr a
}
define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -19682,6 +22060,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__amdgpu_no_fine_grained_me
}
define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -19891,6 +22281,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__amdgpu_no_fine_grained_memory
}
define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16__maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -20136,6 +22538,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16__maybe_remote(ptr addrspac
}
define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16__maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -20375,6 +22789,18 @@ define void @global_agent_atomic_fadd_noret_v2f16__maybe_remote(ptr addrspace(1)
; --------------------------------------------------------------------
define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -20751,6 +23177,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
}
define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -21129,6 +23567,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_
}
define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -21511,6 +23961,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_
}
define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -21875,6 +24337,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
}
define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -22242,6 +24716,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fin
}
define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-2048 scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -22617,6 +25103,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fin
}
define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:2044 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -22998,6 +25496,18 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16__offset12b_pos__amdgpu
}
define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:2044 scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_SYS
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -23368,6 +25878,18 @@ define void @global_system_atomic_fadd_noret_v2bf16__offset12b_pos__amdgpu_no_fi
}
define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -23744,6 +26266,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_remote_memor
}
define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -24108,6 +26642,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_remote_memory(ptr
}
define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -24484,6 +27030,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__amdgpu_no_fine_grained
}
define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %val) #0 {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -24848,6 +27406,18 @@ define void @global_agent_atomic_fadd_noret_v2bf16__amdgpu_no_fine_grained_memor
}
define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX1250-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16__maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -25224,6 +27794,18 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16__maybe_remote(ptr addrs
}
define void @global_agent_atomic_fadd_noret_v2bf16__maybe_remote(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX1250-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_wb scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_storecnt 0x0
+; GFX1250-NEXT: global_inv scope:SCOPE_DEV
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16__maybe_remote:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
More information about the llvm-commits
mailing list