[llvm] a3fc8ad - [AMDGPU] Add GFX11 test coverage for the memory legalizer
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 9 07:42:26 PDT 2022
Author: Jay Foad
Date: 2022-06-09T15:35:56+01:00
New Revision: a3fc8adb7e70df2ac82ae72fb7bb3d8f390cb5ec
URL: https://github.com/llvm/llvm-project/commit/a3fc8adb7e70df2ac82ae72fb7bb3d8f390cb5ec
DIFF: https://github.com/llvm/llvm-project/commit/a3fc8adb7e70df2ac82ae72fb7bb3d8f390cb5ec.diff
LOG: [AMDGPU] Add GFX11 test coverage for the memory legalizer
Added:
Modified:
llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 26201f281960..98406ab613ea 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX6-LABEL: singlethread_acquire_fence:
@@ -45,6 +47,14 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acquire
ret void
@@ -86,6 +96,14 @@ define amdgpu_kernel void @singlethread_release_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") release
ret void
@@ -127,6 +145,14 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") acq_rel
ret void
@@ -168,6 +194,14 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread") seq_cst
ret void
@@ -209,6 +243,14 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acquire
ret void
@@ -250,6 +292,14 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") release
ret void
@@ -291,6 +341,14 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") acq_rel
ret void
@@ -332,6 +390,14 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: singlethread_one_as_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: singlethread_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("singlethread-one-as") seq_cst
ret void
@@ -373,6 +439,14 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acquire
ret void
@@ -414,6 +488,14 @@ define amdgpu_kernel void @wavefront_release_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") release
ret void
@@ -455,6 +537,14 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") acq_rel
ret void
@@ -496,6 +586,14 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront") seq_cst
ret void
@@ -537,6 +635,14 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_acquire_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acquire
ret void
@@ -578,6 +684,14 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_release_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") release
ret void
@@ -619,6 +733,14 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_acq_rel_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") acq_rel
ret void
@@ -660,6 +782,14 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-LABEL: wavefront_one_as_seq_cst_fence:
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: wavefront_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("wavefront-one-as") seq_cst
ret void
@@ -714,6 +844,18 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acquire
ret void
@@ -765,6 +907,17 @@ define amdgpu_kernel void @workgroup_release_fence() {
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") release
ret void
@@ -819,6 +972,18 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") acq_rel
ret void
@@ -873,6 +1038,18 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup") seq_cst
ret void
@@ -921,6 +1098,17 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acquire
ret void
@@ -966,6 +1154,16 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
; GFX940-TGSPLIT: ; %bb.0: ; %entry
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") release
ret void
@@ -1014,6 +1212,17 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") acq_rel
ret void
@@ -1062,6 +1271,17 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: workgroup_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("workgroup-one-as") seq_cst
ret void
@@ -1126,6 +1346,22 @@ define amdgpu_kernel void @agent_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acquire
ret void
@@ -1180,6 +1416,18 @@ define amdgpu_kernel void @agent_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") release
ret void
@@ -1244,6 +1492,22 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") acq_rel
ret void
@@ -1308,6 +1572,22 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent") seq_cst
ret void
@@ -1372,6 +1652,22 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acquire
ret void
@@ -1426,6 +1722,18 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") release
ret void
@@ -1490,6 +1798,22 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") acq_rel
ret void
@@ -1554,6 +1878,22 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: agent_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: agent_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("agent-one-as") seq_cst
ret void
@@ -1622,6 +1962,22 @@ define amdgpu_kernel void @system_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence acquire
ret void
@@ -1678,6 +2034,18 @@ define amdgpu_kernel void @system_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
entry:
fence release
ret void
@@ -1746,6 +2114,22 @@ define amdgpu_kernel void @system_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence acq_rel
ret void
@@ -1814,6 +2198,22 @@ define amdgpu_kernel void @system_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence seq_cst
ret void
@@ -1882,6 +2282,22 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_acquire_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_acquire_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acquire
ret void
@@ -1938,6 +2354,18 @@ define amdgpu_kernel void @system_one_as_release_fence() {
; GFX940-TGSPLIT-NEXT: buffer_wbl2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_release_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_release_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") release
ret void
@@ -2006,6 +2434,22 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_acq_rel_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_acq_rel_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") acq_rel
ret void
@@ -2074,6 +2518,22 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: system_one_as_seq_cst_fence:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: system_one_as_seq_cst_fence:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
entry:
fence syncscope("one-as") seq_cst
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 56f1ec99daba..f9cfe767be84 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
@@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_agent_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
@@ -223,6 +251,32 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
@@ -343,6 +397,36 @@ define amdgpu_kernel void @flat_agent_acquire_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
@@ -473,6 +557,40 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
@@ -566,6 +684,30 @@ define amdgpu_kernel void @flat_agent_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
@@ -658,6 +800,30 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
@@ -762,6 +928,34 @@ define amdgpu_kernel void @flat_agent_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") release, align 4
@@ -866,6 +1060,34 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
@@ -958,6 +1180,30 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") monotonic
@@ -1069,6 +1315,38 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
@@ -1173,6 +1451,34 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") release
@@ -1296,6 +1602,42 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
@@ -1419,6 +1761,42 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
@@ -1536,6 +1914,38 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acquire
@@ -1666,6 +2076,42 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") acq_rel
@@ -1796,6 +2242,42 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent") seq_cst
@@ -1901,6 +2383,32 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2025,6 +2533,40 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2142,6 +2684,36 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2278,6 +2850,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2414,6 +3024,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2538,6 +3186,40 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2662,6 +3344,40 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2798,6 +3514,44 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2934,6 +3688,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3070,6 +3862,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3206,6 +4036,44 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3342,6 +4210,44 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3478,6 +4384,44 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3614,6 +4558,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3750,6 +4732,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3879,6 +4899,36 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4019,6 +5069,40 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4162,6 +5246,40 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4314,6 +5432,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4466,6 +5622,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4606,6 +5800,40 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4746,6 +5974,40 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4898,6 +6160,44 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5050,6 +6350,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5202,6 +6540,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5354,6 +6730,44 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5506,6 +6920,44 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5658,6 +7110,44 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5810,6 +7300,44 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5962,6 +7490,44 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6075,6 +7641,32 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4
@@ -6186,6 +7778,32 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4
@@ -6312,6 +7930,38 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4
@@ -6448,6 +8098,42 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4
@@ -6541,6 +8227,30 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
@@ -6633,6 +8343,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
@@ -6737,6 +8471,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
@@ -6841,6 +8603,34 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
@@ -6933,6 +8723,30 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -7042,6 +8856,36 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
@@ -7146,6 +8990,34 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") release
@@ -7267,6 +9139,40 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -7388,6 +9294,40 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -7510,6 +9450,40 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acquire
@@ -7645,6 +9619,44 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -7780,6 +9792,44 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -7885,6 +9935,32 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8007,6 +10083,38 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8124,6 +10232,36 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8258,6 +10396,42 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8392,6 +10566,42 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8514,6 +10724,38 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8636,6 +10878,38 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8770,6 +11044,42 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8904,6 +11214,42 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9038,6 +11384,42 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9172,6 +11554,42 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9306,6 +11724,42 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9440,6 +11894,42 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9574,6 +12064,42 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9708,6 +12234,42 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9837,6 +12399,36 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9983,6 +12575,42 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10126,6 +12754,40 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10284,6 +12946,46 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10442,6 +13144,46 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10588,6 +13330,42 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10734,6 +13512,42 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10892,6 +13706,46 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11050,6 +13904,46 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11208,6 +14102,46 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11366,6 +14300,46 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11524,6 +14498,46 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11682,6 +14696,46 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11840,6 +14894,46 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11998,6 +15092,46 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 57b49ddd843d..20d5b1431730 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4, !nontemporal !0
@@ -235,6 +263,34 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -348,6 +404,32 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
@@ -471,6 +553,34 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
+; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
+; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index b2b148435068..08b230d818f6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
@@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
@@ -223,6 +251,32 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
@@ -334,6 +388,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
@@ -445,6 +525,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
@@ -538,6 +644,30 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
@@ -630,6 +760,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
@@ -722,6 +876,30 @@ define amdgpu_kernel void @flat_singlethread_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
@@ -814,6 +992,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
@@ -906,6 +1108,30 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") monotonic
@@ -998,6 +1224,30 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
@@ -1090,6 +1340,30 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") release
@@ -1182,6 +1456,30 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1274,6 +1572,30 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1382,6 +1704,34 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acquire
@@ -1491,6 +1841,34 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1600,6 +1978,34 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1705,6 +2111,32 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1810,6 +2242,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1915,6 +2373,32 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2020,6 +2504,32 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2125,6 +2635,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2230,6 +2766,32 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2335,6 +2897,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2440,6 +3028,32 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2545,6 +3159,32 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2650,6 +3290,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2755,6 +3421,32 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2860,6 +3552,32 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2965,6 +3683,32 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3070,6 +3814,32 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3175,6 +3945,32 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3304,6 +4100,36 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3435,6 +4261,36 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3566,6 +4422,36 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3697,6 +4583,36 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3828,6 +4744,36 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3959,6 +4905,36 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4090,6 +5066,36 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4221,6 +5227,36 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4352,6 +5388,36 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4483,6 +5549,36 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4614,6 +5710,36 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4745,6 +5871,36 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4876,6 +6032,36 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5007,6 +6193,36 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5138,6 +6354,36 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5251,6 +6497,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
@@ -5362,6 +6634,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -5473,6 +6771,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
@@ -5584,6 +6908,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -5677,6 +7027,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
@@ -5769,6 +7143,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -5861,6 +7259,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
@@ -5953,6 +7375,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -6045,6 +7491,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -6137,6 +7607,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -6229,6 +7723,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") release
@@ -6321,6 +7839,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -6413,6 +7955,30 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -6521,6 +8087,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -6630,6 +8224,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -6739,6 +8361,34 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -6844,6 +8494,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6949,6 +8625,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7054,6 +8756,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7159,6 +8887,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7264,6 +9018,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7369,6 +9149,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7474,6 +9280,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7579,6 +9411,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7684,6 +9542,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7789,6 +9673,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7894,6 +9804,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7999,6 +9935,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8104,6 +10066,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8209,6 +10197,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8314,6 +10328,32 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8443,6 +10483,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8574,6 +10644,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8705,6 +10805,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8836,6 +10966,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8967,6 +11127,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9098,6 +11288,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9229,6 +11449,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9360,6 +11610,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9491,6 +11771,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9622,6 +11932,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9753,6 +12093,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9884,6 +12254,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10015,6 +12415,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10146,6 +12576,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10277,6 +12737,36 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index f9318d31b68a..fe80c6692395 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
@@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_system_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in unordered, align 4
@@ -223,6 +251,32 @@ define amdgpu_kernel void @flat_system_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in monotonic, align 4
@@ -345,6 +399,36 @@ define amdgpu_kernel void @flat_system_acquire_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in acquire, align 4
@@ -477,6 +561,40 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in seq_cst, align 4
@@ -570,6 +688,30 @@ define amdgpu_kernel void @flat_system_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out unordered, align 4
@@ -662,6 +804,30 @@ define amdgpu_kernel void @flat_system_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out monotonic, align 4
@@ -768,6 +934,34 @@ define amdgpu_kernel void @flat_system_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out release, align 4
@@ -874,6 +1068,34 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out seq_cst, align 4
@@ -966,6 +1188,30 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in monotonic
@@ -1079,6 +1325,38 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acquire
@@ -1185,6 +1463,34 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in release
@@ -1312,6 +1618,42 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
@@ -1439,6 +1781,42 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
@@ -1558,6 +1936,38 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acquire
@@ -1692,6 +2102,42 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in acq_rel
@@ -1826,6 +2272,42 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in seq_cst
@@ -1931,6 +2413,32 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2057,6 +2565,40 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2176,6 +2718,36 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2316,6 +2888,44 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2456,6 +3066,44 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2582,6 +3230,40 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2708,6 +3390,40 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2848,6 +3564,44 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2988,6 +3742,44 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3128,6 +3920,44 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3268,6 +4098,44 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3408,6 +4276,44 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3548,6 +4454,44 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3688,6 +4632,44 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3828,6 +4810,44 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3957,6 +4977,36 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4099,6 +5149,40 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4244,6 +5328,40 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4400,6 +5518,44 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4556,6 +5712,44 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4698,6 +5892,40 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4840,6 +6068,40 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4996,6 +6258,44 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5152,6 +6452,44 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5308,6 +6646,44 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5464,6 +6840,44 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5620,6 +7034,44 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5776,6 +7228,44 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5932,6 +7422,44 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6088,6 +7616,44 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6201,6 +7767,32 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4
@@ -6312,6 +7904,32 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4
@@ -6440,6 +8058,38 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4
@@ -6578,6 +8228,42 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4
@@ -6671,6 +8357,30 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
@@ -6763,6 +8473,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
@@ -6869,6 +8603,34 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
@@ -6975,6 +8737,34 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
@@ -7067,6 +8857,30 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") monotonic
@@ -7178,6 +8992,36 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
@@ -7284,6 +9128,34 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") release
@@ -7409,6 +9281,40 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
@@ -7534,6 +9440,40 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
@@ -7658,6 +9598,40 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acquire
@@ -7797,6 +9771,44 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") acq_rel
@@ -7936,6 +9948,44 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("one-as") seq_cst
@@ -8041,6 +10091,32 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8165,6 +10241,38 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8284,6 +10392,36 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8422,6 +10560,42 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8560,6 +10734,42 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8684,6 +10894,38 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8808,6 +11050,38 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8946,6 +11220,42 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9084,6 +11394,42 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9222,6 +11568,42 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9360,6 +11742,42 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9498,6 +11916,42 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9636,6 +12090,42 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9774,6 +12264,42 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9912,6 +12438,42 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10041,6 +12603,36 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10189,6 +12781,42 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10334,6 +12962,40 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10496,6 +13158,46 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10658,6 +13360,46 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10806,6 +13548,42 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10954,6 +13732,42 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11116,6 +13930,46 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11278,6 +14132,46 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11440,6 +14334,46 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11602,6 +14536,46 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11764,6 +14738,46 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -11926,6 +14940,46 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -12088,6 +15142,46 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -12250,6 +15344,46 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index 88a5ccb5de9d..a6ec811c1255 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -3,6 +3,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
@@ -60,6 +62,34 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load volatile i32, i32* %in, align 4
@@ -129,6 +159,36 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0
+; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -194,6 +254,34 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load i32, i32* %in, align 4
@@ -263,6 +351,36 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_nontemporal_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2]
+; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_nontemporal_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1
+; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0
+; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2]
+; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, s0, s3, 0, s0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -328,6 +446,34 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_volatile_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic volatile i32, i32* %in syncscope("workgroup") acquire, align 4
@@ -386,6 +532,33 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_volatile_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_volatile_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic volatile i32 %in, i32* %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 062f981230ba..7b1b5a1899c0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
@@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
@@ -223,6 +251,32 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
@@ -334,6 +388,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
@@ -445,6 +525,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
@@ -538,6 +644,30 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
@@ -630,6 +760,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
@@ -722,6 +876,30 @@ define amdgpu_kernel void @flat_wavefront_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
@@ -814,6 +992,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4
@@ -906,6 +1108,30 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") monotonic
@@ -998,6 +1224,30 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
@@ -1090,6 +1340,30 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") release
@@ -1182,6 +1456,30 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1274,6 +1572,30 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1382,6 +1704,34 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acquire
@@ -1491,6 +1841,34 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1600,6 +1978,34 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1705,6 +2111,32 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1810,6 +2242,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1915,6 +2373,32 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2020,6 +2504,32 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2125,6 +2635,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2230,6 +2766,32 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2335,6 +2897,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2440,6 +3028,32 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2545,6 +3159,32 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2650,6 +3290,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2755,6 +3421,32 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2860,6 +3552,32 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2965,6 +3683,32 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3070,6 +3814,32 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3175,6 +3945,32 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3304,6 +4100,36 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3435,6 +4261,36 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3566,6 +4422,36 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3697,6 +4583,36 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3828,6 +4744,36 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3959,6 +4905,36 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4090,6 +5066,36 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4221,6 +5227,36 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4352,6 +5388,36 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4483,6 +5549,36 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4614,6 +5710,36 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4745,6 +5871,36 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4876,6 +6032,36 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5007,6 +6193,36 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5138,6 +6354,36 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5251,6 +6497,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4
@@ -5362,6 +6634,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -5473,6 +6771,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4
@@ -5584,6 +6908,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -5677,6 +7027,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
@@ -5769,6 +7143,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -5861,6 +7259,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
@@ -5953,6 +7375,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -6045,6 +7491,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -6137,6 +7607,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -6229,6 +7723,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") release
@@ -6321,6 +7839,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -6413,6 +7955,30 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -6521,6 +8087,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -6630,6 +8224,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -6739,6 +8361,34 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -6844,6 +8494,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -6949,6 +8625,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7054,6 +8756,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7159,6 +8887,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7264,6 +9018,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7369,6 +9149,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7474,6 +9280,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7579,6 +9411,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7684,6 +9542,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7789,6 +9673,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7894,6 +9804,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7999,6 +9935,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8104,6 +10066,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8209,6 +10197,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8314,6 +10328,32 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8443,6 +10483,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8574,6 +10644,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8705,6 +10805,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8836,6 +10966,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8967,6 +11127,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9098,6 +11288,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9229,6 +11449,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9360,6 +11610,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9491,6 +11771,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9622,6 +11932,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9753,6 +12093,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9884,6 +12254,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10015,6 +12415,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10146,6 +12576,36 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index cc06b9be0d5a..efc7a8294b04 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -7,6 +7,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
@@ -112,6 +114,32 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
@@ -223,6 +251,32 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
@@ -342,6 +396,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
@@ -470,6 +552,37 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
@@ -563,6 +676,30 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
@@ -655,6 +792,30 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
@@ -756,6 +917,33 @@ define amdgpu_kernel void @flat_workgroup_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
@@ -857,6 +1045,33 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
@@ -949,6 +1164,30 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") monotonic
@@ -1053,6 +1292,34 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
@@ -1154,6 +1421,33 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") release
@@ -1267,6 +1561,37 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1380,6 +1705,37 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1491,6 +1847,35 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acquire
@@ -1612,6 +1997,38 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1733,6 +2150,38 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1838,6 +2287,32 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -1955,6 +2430,36 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2069,6 +2574,35 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2195,6 +2729,39 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2321,6 +2888,39 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2438,6 +3038,36 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2555,6 +3185,36 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2681,6 +3341,39 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2807,6 +3500,39 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -2933,6 +3659,39 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3059,6 +3818,39 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3188,6 +3980,36 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3325,6 +4147,37 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3465,6 +4318,39 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3611,6 +4497,40 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3757,6 +4677,40 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -3894,6 +4848,37 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4031,6 +5016,37 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4177,6 +5193,40 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4323,6 +5373,40 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4469,6 +5553,40 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4615,6 +5733,40 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4761,6 +5913,40 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -4907,6 +6093,40 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5053,6 +6273,40 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5199,6 +6453,40 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -5312,6 +6600,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4
@@ -5423,6 +6737,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -5538,6 +6878,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4
@@ -5657,6 +7025,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s3
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %in, i32* %out) {
entry:
%val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -5750,6 +7148,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
@@ -5842,6 +7264,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -5938,6 +7384,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
@@ -6034,6 +7506,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32* %out) {
entry:
store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -6126,6 +7624,30 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -6224,6 +7746,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -6320,6 +7868,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") release
@@ -6422,6 +7996,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -6524,6 +8126,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -6636,6 +8266,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -6753,6 +8413,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -6870,6 +8562,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -6975,6 +8699,32 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7086,6 +8836,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7195,6 +8973,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7310,6 +9116,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7425,6 +9261,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7536,6 +9402,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7647,6 +9541,34 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7762,6 +9684,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7877,6 +9829,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -7992,6 +9974,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8107,6 +10119,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8222,6 +10264,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8337,6 +10409,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8452,6 +10554,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8567,6 +10699,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8696,6 +10858,36 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8831,6 +11023,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -8966,6 +11190,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9105,6 +11361,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9244,6 +11534,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9379,6 +11703,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9514,6 +11870,38 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9653,6 +12041,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9792,6 +12214,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -9931,6 +12387,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10070,6 +12560,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10209,6 +12733,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10348,6 +12906,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10487,6 +13079,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
@@ -10626,6 +13252,40 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-CU-NEXT: s_endpgm
i32* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 8537aa5b4a30..7a18ad0e52ea 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_agent_unordered_load(
; GFX6-LABEL: global_agent_unordered_load:
@@ -112,6 +114,26 @@ define amdgpu_kernel void @global_agent_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") unordered, align 4
@@ -222,6 +244,26 @@ define amdgpu_kernel void @global_agent_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") monotonic, align 4
@@ -342,6 +384,30 @@ define amdgpu_kernel void @global_agent_acquire_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") acquire, align 4
@@ -467,6 +533,32 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent") seq_cst, align 4
@@ -569,6 +661,28 @@ define amdgpu_kernel void @global_agent_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") unordered, align 4
@@ -670,6 +784,28 @@ define amdgpu_kernel void @global_agent_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") monotonic, align 4
@@ -784,6 +920,32 @@ define amdgpu_kernel void @global_agent_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") release, align 4
@@ -898,6 +1060,32 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent") seq_cst, align 4
@@ -999,6 +1187,28 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") monotonic
@@ -1119,6 +1329,34 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
@@ -1233,6 +1471,32 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") release
@@ -1366,6 +1630,38 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
@@ -1499,6 +1795,38 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
@@ -1628,6 +1956,36 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acquire
@@ -1771,6 +2129,40 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") acq_rel
@@ -1914,6 +2306,40 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent") seq_cst
@@ -2023,6 +2449,30 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2151,6 +2601,36 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2273,6 +2753,34 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2414,6 +2922,40 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2555,6 +3097,40 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2683,6 +3259,36 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2811,6 +3417,36 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2952,6 +3588,40 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3093,6 +3763,40 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3234,6 +3938,40 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3375,6 +4113,40 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3516,6 +4288,40 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3657,6 +4463,40 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3798,6 +4638,40 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3939,6 +4813,40 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4068,6 +4976,34 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4209,6 +5145,38 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4353,6 +5321,38 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4507,6 +5507,42 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4661,6 +5697,42 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4802,6 +5874,38 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4943,6 +6047,38 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5097,6 +6233,42 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5251,6 +6423,42 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5405,6 +6613,42 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5559,6 +6803,42 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5713,6 +6993,42 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5867,6 +7183,42 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6021,6 +7373,42 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6175,6 +7563,42 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -6287,6 +7711,26 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") unordered, align 4
@@ -6397,6 +7841,26 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") monotonic, align 4
@@ -6517,6 +7981,30 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") acquire, align 4
@@ -6642,6 +8130,32 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("agent-one-as") seq_cst, align 4
@@ -6744,6 +8258,28 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") unordered, align 4
@@ -6845,6 +8381,28 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") monotonic, align 4
@@ -6959,6 +8517,32 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") release, align 4
@@ -7073,6 +8657,32 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("agent-one-as") seq_cst, align 4
@@ -7174,6 +8784,28 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -7294,6 +8926,34 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -7408,6 +9068,32 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") release
@@ -7541,6 +9227,38 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -7674,6 +9392,38 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -7803,6 +9553,36 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -7946,6 +9726,40 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -8089,6 +9903,40 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -8198,6 +10046,30 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8326,6 +10198,36 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8448,6 +10350,34 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8589,6 +10519,40 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8730,6 +10694,40 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8858,6 +10856,36 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8986,6 +11014,36 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9127,6 +11185,40 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9268,6 +11360,40 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9409,6 +11535,40 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9550,6 +11710,40 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9691,6 +11885,40 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9832,6 +12060,40 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9973,6 +12235,40 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10114,6 +12410,40 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10243,6 +12573,34 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10384,6 +12742,38 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10538,6 +12928,42 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10692,6 +13118,42 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10833,6 +13295,38 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10974,6 +13468,38 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11128,6 +13654,42 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11282,6 +13844,42 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11436,6 +14034,42 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11590,6 +14224,42 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11744,6 +14414,42 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11898,6 +14604,42 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -12052,6 +14794,42 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -12206,6 +14984,42 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 38062a8ed0f8..33fc874dc6f7 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_nontemporal_load_0(
; GFX6-LABEL: global_nontemporal_load_0:
@@ -115,6 +117,28 @@ define amdgpu_kernel void @global_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_nontemporal_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_nontemporal_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
@@ -237,6 +261,28 @@ define amdgpu_kernel void @global_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_nontemporal_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_nontemporal_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -352,6 +398,28 @@ define amdgpu_kernel void @global_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_nontemporal_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_nontemporal_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -469,6 +537,28 @@ define amdgpu_kernel void @global_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] nt
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_nontemporal_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_nontemporal_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index d09b06748fef..b89c91a3cf28 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX6-LABEL: global_singlethread_unordered_load:
@@ -112,6 +114,26 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") unordered, align 4
@@ -222,6 +244,26 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") monotonic, align 4
@@ -332,6 +374,26 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") acquire, align 4
@@ -442,6 +504,26 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread") seq_cst, align 4
@@ -544,6 +626,28 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") unordered, align 4
@@ -645,6 +749,28 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") monotonic, align 4
@@ -746,6 +872,28 @@ define amdgpu_kernel void @global_singlethread_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") release, align 4
@@ -847,6 +995,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread") seq_cst, align 4
@@ -948,6 +1118,28 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") monotonic
@@ -1049,6 +1241,28 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire
@@ -1150,6 +1364,28 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") release
@@ -1251,6 +1487,28 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1352,6 +1610,28 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1471,6 +1751,32 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acquire
@@ -1591,6 +1897,32 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1711,6 +2043,32 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1820,6 +2178,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1929,6 +2311,30 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2038,6 +2444,30 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2147,6 +2577,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2256,6 +2710,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2365,6 +2843,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2474,6 +2976,30 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2583,6 +3109,30 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2692,6 +3242,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2801,6 +3375,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2910,6 +3508,30 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3019,6 +3641,30 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3128,6 +3774,30 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3237,6 +3907,30 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3346,6 +4040,30 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3475,6 +4193,34 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3606,6 +4352,34 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3737,6 +4511,34 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3868,6 +4670,34 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3999,6 +4829,34 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4130,6 +4988,34 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4261,6 +5147,34 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4392,6 +5306,34 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4523,6 +5465,34 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4654,6 +5624,34 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4785,6 +5783,34 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4916,6 +5942,34 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5047,6 +6101,34 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5178,6 +6260,34 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5309,6 +6419,34 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5421,6 +6559,26 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") unordered, align 4
@@ -5531,6 +6689,26 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -5641,6 +6819,26 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") acquire, align 4
@@ -5751,6 +6949,26 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -5853,6 +7071,28 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") unordered, align 4
@@ -5954,6 +7194,28 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -6055,6 +7317,28 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") release, align 4
@@ -6156,6 +7440,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -6257,6 +7563,28 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -6358,6 +7686,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -6459,6 +7809,28 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") release
@@ -6560,6 +7932,28 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -6661,6 +8055,28 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -6780,6 +8196,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -6900,6 +8342,32 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -7020,6 +8488,32 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -7129,6 +8623,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7238,6 +8756,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7347,6 +8889,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7456,6 +9022,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7565,6 +9155,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7674,6 +9288,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7783,6 +9421,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7892,6 +9554,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8001,6 +9687,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8110,6 +9820,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8219,6 +9953,30 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8328,6 +10086,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8437,6 +10219,30 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8546,6 +10352,30 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8655,6 +10485,30 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8784,6 +10638,34 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8915,6 +10797,34 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9046,6 +10956,34 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9177,6 +11115,34 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9308,6 +11274,34 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9439,6 +11433,34 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9570,6 +11592,34 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9701,6 +11751,34 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9832,6 +11910,34 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9963,6 +12069,34 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10094,6 +12228,34 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10225,6 +12387,34 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10356,6 +12546,34 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10487,6 +12705,34 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10618,6 +12864,34 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 54407c1f5311..c8da74286bca 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_system_unordered_load(
; GFX6-LABEL: global_system_unordered_load:
@@ -112,6 +114,26 @@ define amdgpu_kernel void @global_system_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in unordered, align 4
@@ -222,6 +244,26 @@ define amdgpu_kernel void @global_system_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in monotonic, align 4
@@ -344,6 +386,30 @@ define amdgpu_kernel void @global_system_acquire_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in acquire, align 4
@@ -471,6 +537,32 @@ define amdgpu_kernel void @global_system_seq_cst_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
@@ -573,6 +665,28 @@ define amdgpu_kernel void @global_system_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out unordered, align 4
@@ -674,6 +788,28 @@ define amdgpu_kernel void @global_system_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out monotonic, align 4
@@ -790,6 +926,32 @@ define amdgpu_kernel void @global_system_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out release, align 4
@@ -906,6 +1068,32 @@ define amdgpu_kernel void @global_system_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
@@ -1007,6 +1195,28 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in monotonic
@@ -1129,6 +1339,34 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
@@ -1245,6 +1483,32 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in release
@@ -1382,6 +1646,38 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
@@ -1519,6 +1815,38 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -1650,6 +1978,36 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acquire
@@ -1797,6 +2155,40 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in acq_rel
@@ -1944,6 +2336,40 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
@@ -2053,6 +2479,30 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2183,6 +2633,36 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2307,6 +2787,34 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2452,6 +2960,40 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2597,6 +3139,40 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2727,6 +3303,36 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2857,6 +3463,36 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3002,6 +3638,40 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3147,6 +3817,40 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3292,6 +3996,40 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3437,6 +4175,40 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3566,6 +4338,34 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3709,6 +4509,38 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3867,6 +4699,42 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4025,6 +4893,42 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4168,6 +5072,38 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4311,6 +5247,38 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4469,6 +5437,42 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4627,6 +5631,42 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4785,6 +5825,42 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4943,6 +6019,42 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5101,6 +6213,42 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5259,6 +6407,42 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5417,6 +6601,42 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5575,6 +6795,42 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5687,6 +6943,26 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") unordered, align 4
@@ -5797,6 +7073,26 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") monotonic, align 4
@@ -5919,6 +7215,30 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") acquire, align 4
@@ -6046,6 +7366,32 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("one-as") seq_cst, align 4
@@ -6148,6 +7494,28 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") unordered, align 4
@@ -6249,6 +7617,28 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") monotonic, align 4
@@ -6365,6 +7755,32 @@ define amdgpu_kernel void @global_system_one_as_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") release, align 4
@@ -6481,6 +7897,32 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("one-as") seq_cst, align 4
@@ -6582,6 +8024,28 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") monotonic
@@ -6704,6 +8168,34 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
@@ -6820,6 +8312,32 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3] sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") release
@@ -6957,6 +8475,38 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
@@ -7094,6 +8644,38 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
@@ -7225,6 +8807,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acquire
@@ -7372,6 +8984,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") acq_rel
@@ -7519,6 +9165,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("one-as") seq_cst
@@ -7628,6 +9308,30 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7758,6 +9462,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7882,6 +9616,34 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8027,6 +9789,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8172,6 +9968,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8302,6 +10132,36 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8432,6 +10292,36 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8577,6 +10467,40 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8722,6 +10646,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8867,6 +10825,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9012,6 +11004,40 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9157,6 +11183,40 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9302,6 +11362,40 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9447,6 +11541,40 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9592,6 +11720,40 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9721,6 +11883,34 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9864,6 +12054,38 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10010,6 +12232,38 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10168,6 +12422,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10326,6 +12616,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10469,6 +12795,38 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10612,6 +12970,38 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10770,6 +13160,42 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10928,6 +13354,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11086,6 +13548,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11244,6 +13742,42 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11402,6 +13936,42 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11560,6 +14130,42 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11718,6 +14324,42 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11876,6 +14518,42 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0 sc1
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: buffer_gl1_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: buffer_gl0_inv
+; GFX11-CU-NEXT: buffer_gl1_inv
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index dacc965f269c..6d47f91198d4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_volatile_load_0(
; GFX6-LABEL: global_volatile_load_0:
@@ -68,6 +70,26 @@ define amdgpu_kernel void @global_volatile_load_0(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_volatile_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_volatile_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(1)* %in, align 4
@@ -146,6 +168,28 @@ define amdgpu_kernel void @global_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_volatile_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_volatile_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -222,6 +266,30 @@ define amdgpu_kernel void @global_volatile_store_0(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_volatile_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_volatile_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -300,6 +368,30 @@ define amdgpu_kernel void @global_volatile_store_1(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_volatile_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_volatile_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -373,6 +465,27 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_volatile_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
@@ -441,6 +554,31 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_volatile_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_volatile_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 32e62b36b322..037ee7c38fd6 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX6-LABEL: global_wavefront_unordered_load:
@@ -112,6 +114,26 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") unordered, align 4
@@ -222,6 +244,26 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") monotonic, align 4
@@ -332,6 +374,26 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") acquire, align 4
@@ -442,6 +504,26 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront") seq_cst, align 4
@@ -544,6 +626,28 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") unordered, align 4
@@ -645,6 +749,28 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") monotonic, align 4
@@ -746,6 +872,28 @@ define amdgpu_kernel void @global_wavefront_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") release, align 4
@@ -847,6 +995,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront") seq_cst, align 4
@@ -948,6 +1118,28 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") monotonic
@@ -1049,6 +1241,28 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire
@@ -1150,6 +1364,28 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") release
@@ -1251,6 +1487,28 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1352,6 +1610,28 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1471,6 +1751,32 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acquire
@@ -1591,6 +1897,32 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1711,6 +2043,32 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1820,6 +2178,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -1929,6 +2311,30 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2038,6 +2444,30 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2147,6 +2577,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2256,6 +2710,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2365,6 +2843,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2474,6 +2976,30 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2583,6 +3109,30 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2692,6 +3242,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2801,6 +3375,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2910,6 +3508,30 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3019,6 +3641,30 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3128,6 +3774,30 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3237,6 +3907,30 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3346,6 +4040,30 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3475,6 +4193,34 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3606,6 +4352,34 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3737,6 +4511,34 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3868,6 +4670,34 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3999,6 +4829,34 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4130,6 +4988,34 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4261,6 +5147,34 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4392,6 +5306,34 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4523,6 +5465,34 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4654,6 +5624,34 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4785,6 +5783,34 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4916,6 +5942,34 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5047,6 +6101,34 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5178,6 +6260,34 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5309,6 +6419,34 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5421,6 +6559,26 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") unordered, align 4
@@ -5531,6 +6689,26 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -5641,6 +6819,26 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") acquire, align 4
@@ -5751,6 +6949,26 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -5853,6 +7071,28 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") unordered, align 4
@@ -5954,6 +7194,28 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -6055,6 +7317,28 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") release, align 4
@@ -6156,6 +7440,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -6257,6 +7563,28 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -6358,6 +7686,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -6459,6 +7809,28 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") release
@@ -6560,6 +7932,28 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -6661,6 +8055,28 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -6780,6 +8196,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -6900,6 +8342,32 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -7020,6 +8488,32 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -7129,6 +8623,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7238,6 +8756,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7347,6 +8889,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7456,6 +9022,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7565,6 +9155,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7674,6 +9288,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7783,6 +9421,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7892,6 +9554,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8001,6 +9687,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8110,6 +9820,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8219,6 +9953,30 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8328,6 +10086,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8437,6 +10219,30 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8546,6 +10352,30 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8655,6 +10485,30 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8784,6 +10638,34 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8915,6 +10797,34 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9046,6 +10956,34 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9177,6 +11115,34 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9308,6 +11274,34 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9439,6 +11433,34 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9570,6 +11592,34 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9701,6 +11751,34 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9832,6 +11910,34 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9963,6 +12069,34 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10094,6 +12228,34 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10225,6 +12387,34 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10356,6 +12546,34 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10487,6 +12705,34 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10618,6 +12864,34 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 7fa48739f63b..a849e583cb61 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX6-LABEL: global_workgroup_unordered_load:
@@ -112,6 +114,26 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") unordered, align 4
@@ -222,6 +244,26 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") monotonic, align 4
@@ -335,6 +377,27 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
@@ -452,6 +515,28 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup") seq_cst, align 4
@@ -554,6 +639,28 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") unordered, align 4
@@ -655,6 +762,28 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") monotonic, align 4
@@ -766,6 +895,31 @@ define amdgpu_kernel void @global_workgroup_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
@@ -877,6 +1031,31 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup") seq_cst, align 4
@@ -978,6 +1157,28 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") monotonic
@@ -1085,6 +1286,30 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
@@ -1196,6 +1421,31 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") release
@@ -1313,6 +1563,33 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1430,6 +1707,33 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1552,6 +1856,33 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acquire
@@ -1685,6 +2016,36 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1818,6 +2179,36 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1927,6 +2318,30 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2042,6 +2457,32 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2161,6 +2602,33 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2286,6 +2754,35 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2411,6 +2908,35 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2526,6 +3052,32 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2641,6 +3193,32 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2766,6 +3344,35 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -2891,6 +3498,35 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3016,6 +3652,35 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3141,6 +3806,35 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3266,6 +3960,35 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3391,6 +4114,35 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3516,6 +4268,35 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3641,6 +4422,35 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3770,6 +4580,34 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -3904,6 +4742,35 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4045,6 +4912,37 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4189,6 +5087,38 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4333,6 +5263,38 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4467,6 +5429,35 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4601,6 +5592,35 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4745,6 +5765,38 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -4889,6 +5941,38 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5033,6 +6117,38 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5177,6 +6293,38 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5321,6 +6469,38 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5465,6 +6645,38 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5609,6 +6821,38 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5753,6 +6997,38 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -5865,6 +7141,26 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") unordered, align 4
@@ -5975,6 +7271,26 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -6088,6 +7404,27 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") acquire, align 4
@@ -6202,6 +7539,28 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
entry:
%val = load atomic i32, i32 addrspace(1)* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -6304,6 +7663,28 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") unordered, align 4
@@ -6405,6 +7786,28 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -6510,6 +7913,30 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") release, align 4
@@ -6615,6 +8042,30 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3] sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(1)* %out) {
entry:
store atomic i32 %in, i32 addrspace(1)* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -6716,6 +8167,28 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s4
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -6823,6 +8296,30 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -6928,6 +8425,30 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_swap v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") release
@@ -7039,6 +8560,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -7150,6 +8697,32 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -7272,6 +8845,33 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -7399,6 +8999,35 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -7526,6 +9155,35 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v0, v1, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -7635,6 +9293,30 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7750,6 +9432,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7863,6 +9571,32 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_atomic_cmpswap v2, v[0:1], s[4:5] offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -7982,6 +9716,34 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8101,6 +9863,34 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8216,6 +10006,32 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8331,6 +10147,32 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8450,6 +10292,34 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8569,6 +10439,34 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8688,6 +10586,34 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8807,6 +10733,34 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -8926,6 +10880,34 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9045,6 +11027,34 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9164,6 +11174,34 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9283,6 +11321,34 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9412,6 +11478,34 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9546,6 +11640,35 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9681,6 +11804,36 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9819,6 +11972,37 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -9957,6 +12141,37 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10091,6 +12306,35 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10225,6 +12469,35 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10363,6 +12636,37 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10501,6 +12805,37 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10639,6 +12974,37 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10777,6 +13143,37 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -10915,6 +13312,37 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11053,6 +13481,37 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11191,6 +13650,37 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
@@ -11329,6 +13819,37 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: global_store_dword v2, v0, s[4:5]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(1)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index c0dcb96de35a..e6efd9b8131b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_agent_unordered_load(
; GFX6-LABEL: local_agent_unordered_load:
@@ -112,6 +114,28 @@ define amdgpu_kernel void @local_agent_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") unordered, align 4
@@ -222,6 +246,28 @@ define amdgpu_kernel void @local_agent_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") monotonic, align 4
@@ -335,6 +381,29 @@ define amdgpu_kernel void @local_agent_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") acquire, align 4
@@ -458,6 +527,32 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent") seq_cst, align 4
@@ -550,6 +645,24 @@ define amdgpu_kernel void @local_agent_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") unordered, align 4
@@ -641,6 +754,24 @@ define amdgpu_kernel void @local_agent_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") monotonic, align 4
@@ -742,6 +873,27 @@ define amdgpu_kernel void @local_agent_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") release, align 4
@@ -843,6 +995,27 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent") seq_cst, align 4
@@ -934,6 +1107,24 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") monotonic
@@ -1035,6 +1226,27 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire
@@ -1136,6 +1348,27 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") release
@@ -1247,6 +1480,30 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel
@@ -1358,6 +1615,30 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst
@@ -1470,6 +1751,29 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acquire
@@ -1593,6 +1897,32 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") acq_rel
@@ -1716,6 +2046,32 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent") seq_cst
@@ -1817,6 +2173,26 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1928,6 +2304,29 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2039,6 +2438,29 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2160,6 +2582,32 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2281,6 +2729,32 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2392,6 +2866,29 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2503,6 +3000,29 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2624,6 +3144,32 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2745,6 +3291,32 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2866,6 +3438,32 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2987,6 +3585,32 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3108,6 +3732,32 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3229,6 +3879,32 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3350,6 +4026,32 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3471,6 +4173,32 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3590,6 +4318,30 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3714,6 +4466,31 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3845,6 +4622,33 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3979,6 +4783,34 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4113,6 +4945,34 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4237,6 +5097,31 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4361,6 +5246,31 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4495,6 +5405,34 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4629,6 +5567,34 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4763,6 +5729,34 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4897,6 +5891,34 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5031,6 +6053,34 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5165,6 +6215,34 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5299,6 +6377,34 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5433,6 +6539,34 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5545,6 +6679,28 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") unordered, align 4
@@ -5655,6 +6811,28 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") monotonic, align 4
@@ -5765,6 +6943,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") acquire, align 4
@@ -5875,6 +7075,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("agent-one-as") seq_cst, align 4
@@ -5967,6 +7189,24 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") unordered, align 4
@@ -6058,6 +7298,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") monotonic, align 4
@@ -6149,6 +7407,24 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") release, align 4
@@ -6240,6 +7516,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("agent-one-as") seq_cst, align 4
@@ -6331,6 +7625,24 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") monotonic
@@ -6422,6 +7734,24 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -6513,6 +7843,24 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") release
@@ -6604,6 +7952,24 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -6695,6 +8061,24 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -6804,6 +8188,28 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acquire
@@ -6914,6 +8320,28 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") acq_rel
@@ -7024,6 +8452,28 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("agent-one-as") seq_cst
@@ -7125,6 +8575,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7226,6 +8696,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7327,6 +8817,26 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7428,6 +8938,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7529,6 +9059,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7630,6 +9180,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7731,6 +9301,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7832,6 +9422,26 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7933,6 +9543,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8034,6 +9664,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8135,6 +9785,26 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8236,6 +9906,26 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8337,6 +10027,26 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8438,6 +10148,26 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8539,6 +10269,26 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8658,6 +10408,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8779,6 +10553,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8900,6 +10698,30 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9021,6 +10843,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9142,6 +10988,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9263,6 +11133,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9384,6 +11278,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9505,6 +11423,30 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9626,6 +11568,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9747,6 +11713,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9868,6 +11858,30 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9989,6 +12003,30 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10110,6 +12148,30 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10231,6 +12293,30 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10352,6 +12438,30 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 75a65723b087..80d66c916f2d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_nontemporal_load_0(
; GFX6-LABEL: local_nontemporal_load_0:
@@ -125,6 +127,32 @@ define amdgpu_kernel void @local_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_nontemporal_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_nontemporal_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
@@ -251,6 +279,32 @@ define amdgpu_kernel void @local_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_nontemporal_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_nontemporal_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -373,6 +427,32 @@ define amdgpu_kernel void @local_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_nontemporal_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_nontemporal_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -496,6 +576,32 @@ define amdgpu_kernel void @local_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_nontemporal_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_nontemporal_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index 8e8080ed81ff..4bde0dca76fe 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_singlethread_unordered_load(
; GFX6-LABEL: local_singlethread_unordered_load:
@@ -112,6 +114,28 @@ define amdgpu_kernel void @local_singlethread_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") unordered, align 4
@@ -222,6 +246,28 @@ define amdgpu_kernel void @local_singlethread_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") monotonic, align 4
@@ -332,6 +378,28 @@ define amdgpu_kernel void @local_singlethread_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") acquire, align 4
@@ -442,6 +510,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread") seq_cst, align 4
@@ -534,6 +624,24 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") unordered, align 4
@@ -625,6 +733,24 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") monotonic, align 4
@@ -716,6 +842,24 @@ define amdgpu_kernel void @local_singlethread_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") release, align 4
@@ -807,6 +951,24 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread") seq_cst, align 4
@@ -898,6 +1060,24 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") monotonic
@@ -989,6 +1169,24 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
@@ -1080,6 +1278,24 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") release
@@ -1171,6 +1387,24 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1262,6 +1496,24 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1371,6 +1623,28 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acquire
@@ -1481,6 +1755,28 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") acq_rel
@@ -1591,6 +1887,28 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread") seq_cst
@@ -1692,6 +2010,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1793,6 +2131,26 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1894,6 +2252,26 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1995,6 +2373,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2096,6 +2494,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2197,6 +2615,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2298,6 +2736,26 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2399,6 +2857,26 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2500,6 +2978,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2601,6 +3099,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2702,6 +3220,26 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2803,6 +3341,26 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2904,6 +3462,26 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3005,6 +3583,26 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3106,6 +3704,26 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3225,6 +3843,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3346,6 +3988,30 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3467,6 +4133,30 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3588,6 +4278,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3709,6 +4423,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3830,6 +4568,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3951,6 +4713,30 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4072,6 +4858,30 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4193,6 +5003,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4314,6 +5148,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4435,6 +5293,30 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4556,6 +5438,30 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4677,6 +5583,30 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4798,6 +5728,30 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4919,6 +5873,30 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5031,6 +6009,28 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") unordered, align 4
@@ -5141,6 +6141,28 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") monotonic, align 4
@@ -5251,6 +6273,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") acquire, align 4
@@ -5361,6 +6405,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("singlethread-one-as") seq_cst, align 4
@@ -5453,6 +6519,24 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") unordered, align 4
@@ -5544,6 +6628,24 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") monotonic, align 4
@@ -5635,6 +6737,24 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") release, align 4
@@ -5726,6 +6846,24 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("singlethread-one-as") seq_cst, align 4
@@ -5817,6 +6955,24 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") monotonic
@@ -5908,6 +7064,24 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -5999,6 +7173,24 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") release
@@ -6090,6 +7282,24 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -6181,6 +7391,24 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -6290,6 +7518,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acquire
@@ -6400,6 +7650,28 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") acq_rel
@@ -6510,6 +7782,28 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("singlethread-one-as") seq_cst
@@ -6611,6 +7905,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6712,6 +8026,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6813,6 +8147,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6914,6 +8268,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7015,6 +8389,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7116,6 +8510,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7217,6 +8631,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7318,6 +8752,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7419,6 +8873,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7520,6 +8994,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7621,6 +9115,26 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7722,6 +9236,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7823,6 +9357,26 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7924,6 +9478,26 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8025,6 +9599,26 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8144,6 +9738,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8265,6 +9883,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8386,6 +10028,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8507,6 +10173,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8628,6 +10318,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8749,6 +10463,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8870,6 +10608,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8991,6 +10753,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9112,6 +10898,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9233,6 +11043,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9354,6 +11188,30 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9475,6 +11333,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9596,6 +11478,30 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9717,6 +11623,30 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9838,6 +11768,30 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 8e0770e9007d..795b2a6359f3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_system_unordered_load(
; GFX6-LABEL: local_system_unordered_load:
@@ -112,6 +114,28 @@ define amdgpu_kernel void @local_system_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in unordered, align 4
@@ -222,6 +246,28 @@ define amdgpu_kernel void @local_system_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in monotonic, align 4
@@ -335,6 +381,29 @@ define amdgpu_kernel void @local_system_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in acquire, align 4
@@ -458,6 +527,32 @@ define amdgpu_kernel void @local_system_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in seq_cst, align 4
@@ -550,6 +645,24 @@ define amdgpu_kernel void @local_system_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out unordered, align 4
@@ -641,6 +754,24 @@ define amdgpu_kernel void @local_system_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out monotonic, align 4
@@ -742,6 +873,27 @@ define amdgpu_kernel void @local_system_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out release, align 4
@@ -843,6 +995,27 @@ define amdgpu_kernel void @local_system_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out seq_cst, align 4
@@ -934,6 +1107,24 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in monotonic
@@ -1035,6 +1226,27 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
@@ -1136,6 +1348,27 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in release
@@ -1247,6 +1480,30 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
@@ -1358,6 +1615,30 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
@@ -1470,6 +1751,29 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acquire
@@ -1593,6 +1897,32 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in acq_rel
@@ -1716,6 +2046,32 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in seq_cst
@@ -1817,6 +2173,26 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1928,6 +2304,29 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2039,6 +2438,29 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2160,6 +2582,32 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2281,6 +2729,32 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2392,6 +2866,29 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2503,6 +3000,29 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2624,6 +3144,32 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2745,6 +3291,32 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2866,6 +3438,32 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2987,6 +3585,32 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3108,6 +3732,32 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3229,6 +3879,32 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3350,6 +4026,32 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3471,6 +4173,32 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3590,6 +4318,30 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3714,6 +4466,31 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3845,6 +4622,33 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3979,6 +4783,34 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4113,6 +4945,34 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4237,6 +5097,31 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4361,6 +5246,31 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4495,6 +5405,34 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4629,6 +5567,34 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4763,6 +5729,34 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4897,6 +5891,34 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5031,6 +6053,34 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5165,6 +6215,34 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5299,6 +6377,34 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5433,6 +6539,34 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5545,6 +6679,28 @@ define amdgpu_kernel void @local_system_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") unordered, align 4
@@ -5655,6 +6811,28 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") monotonic, align 4
@@ -5765,6 +6943,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") acquire, align 4
@@ -5875,6 +7075,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("one-as") seq_cst, align 4
@@ -5967,6 +7189,24 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") unordered, align 4
@@ -6058,6 +7298,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") monotonic, align 4
@@ -6149,6 +7407,24 @@ define amdgpu_kernel void @local_system_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") release, align 4
@@ -6240,6 +7516,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("one-as") seq_cst, align 4
@@ -6331,6 +7625,24 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") monotonic
@@ -6422,6 +7734,24 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
@@ -6513,6 +7843,24 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") release
@@ -6604,6 +7952,24 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
@@ -6695,6 +8061,24 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
@@ -6804,6 +8188,28 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acquire
@@ -6914,6 +8320,28 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") acq_rel
@@ -7024,6 +8452,28 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("one-as") seq_cst
@@ -7125,6 +8575,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7226,6 +8696,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7327,6 +8817,26 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7428,6 +8938,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7529,6 +9059,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7630,6 +9180,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7731,6 +9301,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7832,6 +9422,26 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7933,6 +9543,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8034,6 +9664,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8135,6 +9785,26 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8236,6 +9906,26 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8337,6 +10027,26 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8438,6 +10148,26 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8539,6 +10269,26 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8658,6 +10408,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8779,6 +10553,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8900,6 +10698,30 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9021,6 +10843,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9142,6 +10988,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9263,6 +11133,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9384,6 +11278,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9505,6 +11423,30 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9626,6 +11568,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9747,6 +11713,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9868,6 +11858,30 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9989,6 +12003,30 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10110,6 +12148,30 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10231,6 +12293,30 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10352,6 +12438,30 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 97aced3707af..98e0e013be5b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_volatile_load_0(
; GFX6-LABEL: local_volatile_load_0:
@@ -73,6 +75,32 @@ define amdgpu_kernel void @local_volatile_load_0(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_volatile_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_volatile_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(3)* %in, align 4
@@ -151,6 +179,32 @@ define amdgpu_kernel void @local_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_volatile_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_volatile_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -225,6 +279,32 @@ define amdgpu_kernel void @local_volatile_store_0(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_volatile_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_volatile_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -300,6 +380,32 @@ define amdgpu_kernel void @local_volatile_store_1(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_volatile_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_volatile_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -369,6 +475,29 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
@@ -431,6 +560,27 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_volatile_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_volatile_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index ba58a0532864..0c689597bc32 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_wavefront_unordered_load(
; GFX6-LABEL: local_wavefront_unordered_load:
@@ -112,6 +114,28 @@ define amdgpu_kernel void @local_wavefront_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") unordered, align 4
@@ -222,6 +246,28 @@ define amdgpu_kernel void @local_wavefront_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") monotonic, align 4
@@ -332,6 +378,28 @@ define amdgpu_kernel void @local_wavefront_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") acquire, align 4
@@ -442,6 +510,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront") seq_cst, align 4
@@ -534,6 +624,24 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") unordered, align 4
@@ -625,6 +733,24 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") monotonic, align 4
@@ -716,6 +842,24 @@ define amdgpu_kernel void @local_wavefront_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") release, align 4
@@ -807,6 +951,24 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront") seq_cst, align 4
@@ -898,6 +1060,24 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") monotonic
@@ -989,6 +1169,24 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire
@@ -1080,6 +1278,24 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") release
@@ -1171,6 +1387,24 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1262,6 +1496,24 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1371,6 +1623,28 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acquire
@@ -1481,6 +1755,28 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") acq_rel
@@ -1591,6 +1887,28 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront") seq_cst
@@ -1692,6 +2010,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1793,6 +2131,26 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1894,6 +2252,26 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1995,6 +2373,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2096,6 +2494,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2197,6 +2615,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2298,6 +2736,26 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2399,6 +2857,26 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2500,6 +2978,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2601,6 +3099,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2702,6 +3220,26 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2803,6 +3341,26 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2904,6 +3462,26 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3005,6 +3583,26 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3106,6 +3704,26 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3225,6 +3843,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3346,6 +3988,30 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3467,6 +4133,30 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3588,6 +4278,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3709,6 +4423,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3830,6 +4568,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3951,6 +4713,30 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4072,6 +4858,30 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4193,6 +5003,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4314,6 +5148,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4435,6 +5293,30 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4556,6 +5438,30 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4677,6 +5583,30 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4798,6 +5728,30 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4919,6 +5873,30 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5031,6 +6009,28 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") unordered, align 4
@@ -5141,6 +6141,28 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") monotonic, align 4
@@ -5251,6 +6273,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") acquire, align 4
@@ -5361,6 +6405,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("wavefront-one-as") seq_cst, align 4
@@ -5453,6 +6519,24 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") unordered, align 4
@@ -5544,6 +6628,24 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") monotonic, align 4
@@ -5635,6 +6737,24 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") release, align 4
@@ -5726,6 +6846,24 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("wavefront-one-as") seq_cst, align 4
@@ -5817,6 +6955,24 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") monotonic
@@ -5908,6 +7064,24 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -5999,6 +7173,24 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") release
@@ -6090,6 +7282,24 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -6181,6 +7391,24 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -6290,6 +7518,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acquire
@@ -6400,6 +7650,28 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") acq_rel
@@ -6510,6 +7782,28 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("wavefront-one-as") seq_cst
@@ -6611,6 +7905,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6712,6 +8026,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6813,6 +8147,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -6914,6 +8268,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7015,6 +8389,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7116,6 +8510,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7217,6 +8631,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7318,6 +8752,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7419,6 +8873,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7520,6 +8994,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7621,6 +9115,26 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7722,6 +9236,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7823,6 +9357,26 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7924,6 +9478,26 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8025,6 +9599,26 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8144,6 +9738,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8265,6 +9883,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8386,6 +10028,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8507,6 +10173,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8628,6 +10318,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8749,6 +10463,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8870,6 +10608,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8991,6 +10753,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9112,6 +10898,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9233,6 +11043,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9354,6 +11188,30 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9475,6 +11333,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9596,6 +11478,30 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9717,6 +11623,30 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9838,6 +11768,30 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index 0cfa962ec84c..c8bbfcb76c95 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX6-LABEL: local_workgroup_unordered_load:
@@ -112,6 +114,28 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") unordered, align 4
@@ -222,6 +246,28 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") monotonic, align 4
@@ -335,6 +381,29 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
@@ -458,6 +527,32 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup") seq_cst, align 4
@@ -550,6 +645,24 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") unordered, align 4
@@ -641,6 +754,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") monotonic, align 4
@@ -742,6 +873,27 @@ define amdgpu_kernel void @local_workgroup_release_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
@@ -843,6 +995,27 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup") seq_cst, align 4
@@ -934,6 +1107,24 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") monotonic
@@ -1035,6 +1226,27 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
@@ -1136,6 +1348,27 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") release
@@ -1247,6 +1480,30 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1358,6 +1615,30 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1470,6 +1751,29 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acquire
@@ -1593,6 +1897,32 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") acq_rel
@@ -1716,6 +2046,32 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup") seq_cst
@@ -1817,6 +2173,26 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -1928,6 +2304,29 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2039,6 +2438,29 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2160,6 +2582,32 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2281,6 +2729,32 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2392,6 +2866,29 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2503,6 +3000,29 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2624,6 +3144,32 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2745,6 +3291,32 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2866,6 +3438,32 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -2987,6 +3585,32 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3108,6 +3732,32 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3229,6 +3879,32 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3350,6 +4026,32 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3471,6 +4173,32 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: buffer_inv sc0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3590,6 +4318,30 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3714,6 +4466,31 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3845,6 +4622,33 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -3979,6 +4783,34 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4113,6 +4945,34 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4237,6 +5097,31 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4361,6 +5246,31 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4495,6 +5405,34 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4629,6 +5567,34 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4763,6 +5729,34 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -4897,6 +5891,34 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5031,6 +6053,34 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5165,6 +6215,34 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5299,6 +6377,34 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5433,6 +6539,34 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: buffer_gl0_inv
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -5545,6 +6679,28 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_unordered_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") unordered, align 4
@@ -5655,6 +6811,28 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") monotonic, align 4
@@ -5765,6 +6943,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") acquire, align 4
@@ -5875,6 +7075,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v1, v0
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_load_b32 v0, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v1, v0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_load:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_load_b32 v0, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v1, v0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
entry:
%val = load atomic i32, i32 addrspace(3)* %in syncscope("workgroup-one-as") seq_cst, align 4
@@ -5967,6 +7189,24 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_unordered_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_unordered_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") unordered, align 4
@@ -6058,6 +7298,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") monotonic, align 4
@@ -6149,6 +7407,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") release, align 4
@@ -6240,6 +7516,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_store:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 %in, i32 addrspace(3)* %out) {
entry:
store atomic i32 %in, i32 addrspace(3)* %out syncscope("workgroup-one-as") seq_cst, align 4
@@ -6331,6 +7625,24 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") monotonic
@@ -6422,6 +7734,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -6513,6 +7843,24 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") release
@@ -6604,6 +7952,24 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -6695,6 +8061,24 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s1
; GFX940-TGSPLIT-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -6804,6 +8188,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acquire
@@ -6914,6 +8320,28 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") acq_rel
@@ -7024,6 +8452,28 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in) {
entry:
%val = atomicrmw volatile xchg i32 addrspace(3)* %out, i32 %in syncscope("workgroup-one-as") seq_cst
@@ -7125,6 +8575,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7226,6 +8696,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7327,6 +8817,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7428,6 +8938,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7529,6 +9059,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7630,6 +9180,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7731,6 +9301,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7832,6 +9422,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -7933,6 +9543,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8034,6 +9664,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8135,6 +9785,26 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8236,6 +9906,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8337,6 +10027,26 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8438,6 +10148,26 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8539,6 +10269,26 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s1
; GFX940-TGSPLIT-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8658,6 +10408,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8779,6 +10553,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -8900,6 +10698,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9021,6 +10843,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9142,6 +10988,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9263,6 +11133,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9384,6 +11278,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9505,6 +11423,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9626,6 +11568,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9747,6 +11713,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9868,6 +11858,30 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -9989,6 +12003,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10110,6 +12148,30 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10231,6 +12293,30 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
@@ -10352,6 +12438,30 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-TGSPLIT-NEXT: ds_write_b32 v0, v1
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: ds_store_b32 v0, v1
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
+; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: ds_store_b32 v0, v1
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(3)* %out, i32 %in, i32 %old) {
entry:
%gep = getelementptr i32, i32 addrspace(3)* %out, i32 4
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 00c74ff0839d..c4261a9e15c3 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -8,6 +8,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @private_nontemporal_load_0(
; GFX6-LABEL: private_nontemporal_load_0:
@@ -149,6 +151,30 @@ define amdgpu_kernel void @private_nontemporal_load_0(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_nontemporal_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_nontemporal_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
@@ -301,6 +327,30 @@ define amdgpu_kernel void @private_nontemporal_load_1(
; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
; GFX940-TGSPLIT-NEXT: global_store_dword v1, v0, s[2:3]
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_nontemporal_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_nontemporal_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -448,6 +498,30 @@ define amdgpu_kernel void @private_nontemporal_store_0(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword off, v0, s4 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_nontemporal_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_nontemporal_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -598,6 +672,32 @@ define amdgpu_kernel void @private_nontemporal_store_1(
; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0
; GFX940-TGSPLIT-NEXT: scratch_store_dword v0, v1, s4 nt
; GFX940-TGSPLIT-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_nontemporal_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_nontemporal_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index f60551438207..973c830fcbbe 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -4,6 +4,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
define amdgpu_kernel void @private_volatile_load_0(
; GFX6-LABEL: private_volatile_load_0:
@@ -93,6 +95,30 @@ define amdgpu_kernel void @private_volatile_load_0(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_volatile_load_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_volatile_load_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%val = load volatile i32, i32 addrspace(5)* %in, align 4
@@ -191,6 +217,30 @@ define amdgpu_kernel void @private_volatile_load_1(
; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_volatile_load_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
+; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
+; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_volatile_load_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc
+; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
+; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(5)* %in, i32 addrspace(1)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -291,6 +341,32 @@ define amdgpu_kernel void @private_volatile_store_0(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, v1, s[4:7], 0 offen
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_volatile_store_0:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_volatile_store_0:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
+; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%val = load i32, i32 addrspace(1)* %in, align 4
@@ -392,6 +468,34 @@ define amdgpu_kernel void @private_volatile_store_1(
; SKIP-CACHE-INV-NEXT: buffer_store_dword v1, v0, s[4:7], 0 offen
; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
; SKIP-CACHE-INV-NEXT: s_endpgm
+;
+; GFX11-WGP-LABEL: private_volatile_store_1:
+; GFX11-WGP: ; %bb.0: ; %entry
+; GFX11-WGP-NEXT: s_clause 0x1
+; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0
+; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-WGP-NEXT: s_endpgm
+;
+; GFX11-CU-LABEL: private_volatile_store_1:
+; GFX11-CU: ; %bb.0: ; %entry
+; GFX11-CU-NEXT: s_clause 0x1
+; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
+; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
+; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0
+; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-CU-NEXT: s_endpgm
i32 addrspace(1)* %in, i32 addrspace(5)* %out) {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
More information about the llvm-commits
mailing list